diff --git a/build.cmd b/build.cmd index 1f98b3c4..b78904b5 100644 --- a/build.cmd +++ b/build.cmd @@ -46,7 +46,7 @@ if /i [%1] == [--skipDotNetBridge] ( echo "Usage: build.cmd [--configuration ] [--runTests] [--buildDotNetBridgeOnly] [--skipDotNetBridge]" echo "" echo "Options:" -echo " --configuration Build Configuration (DbgWinPy3.6,DbgWinPy3.5,DbgWinPy2.7,RlsWinPy3.6,RlsWinPy3.5,RlsWinPy2.7)" +echo " --configuration Build Configuration (DbgWinPy3.7,DbgWinPy3.6,DbgWinPy3.5,DbgWinPy2.7,RlsWinPy3.7,RlsWinPy3.6,RlsWinPy3.5,RlsWinPy2.7)" echo " --runTests Run tests after build" echo " --buildDotNetBridgeOnly Build only DotNetBridge" echo " --skipDotNetBridge Build everything except DotNetBridge" diff --git a/build/ci/phase-template.yml b/build/ci/phase-template.yml index e4e02f57..ce357221 100644 --- a/build/ci/phase-template.yml +++ b/build/ci/phase-template.yml @@ -24,7 +24,7 @@ phases: - script: $(_buildScript) --configuration $(_configuration) --runTests # Mac phases - ${{ if eq(parameters.name, 'Mac') }}: - - script: brew update && brew install libomp mono-libgdiplus gettext && brew link gettext --force + - script: brew update && brew install https://raw.githubusercontent.com/Homebrew/homebrew-core/f5b1ac99a7fba27c19cee0bc4f036775c889b359/Formula/libomp.rb mono-libgdiplus gettext && brew link gettext --force - ${{ if eq(parameters.testDistro, 'noTests') }}: - script: chmod 777 $(_buildScript) && $(_buildScript) --configuration $(_configuration) - ${{ if eq(parameters.testDistro, '') }}: diff --git a/build/libs_linux.txt b/build/libs_linux.txt index 3bbde144..c5e38f5a 100644 --- a/build/libs_linux.txt +++ b/build/libs_linux.txt @@ -1,4 +1,3 @@ -Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.so libFactorizationMachineNative.so diff --git a/build/libs_mac.txt b/build/libs_mac.txt index 7373bb8f..efb3e632 100644 --- a/build/libs_mac.txt +++ b/build/libs_mac.txt @@ -1,4 +1,3 @@ -Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.dylib libFactorizationMachineNative.dylib diff --git a/build/libs_win.txt b/build/libs_win.txt index 54854ace..3359f7cd 100644 --- a/build/libs_win.txt +++ b/build/libs_win.txt @@ -5,6 +5,7 @@ FactorizationMachineNative.dll FastTreeNative.dll LdaNative.dll lib_lightgbm.dll +libiomp5md.dll MklImports.dll SymSgdNative.dll tensorflow.dll diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 14475302..1395c998 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -10,14 +10,12 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; -using Microsoft.ML.ImageAnalytics; -using Microsoft.ML.LightGBM; -using Microsoft.ML.Model.Onnx; +using Microsoft.ML.Model.OnnxConverter; +using Microsoft.ML.Runtime; using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.Ensemble; using Microsoft.ML.Trainers.FastTree; -using Microsoft.ML.Trainers.KMeans; -using Microsoft.ML.Trainers.PCA; -using Microsoft.ML.Trainers.SymSgd; +using Microsoft.ML.Trainers.LightGbm; using Microsoft.ML.Transforms; namespace Microsoft.MachineLearning.DotNetBridge @@ -307,107 +305,110 @@ private static unsafe IntPtr GetFn(FnId id) /// private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cdata, DataSourceBlock** ppdata) { - using (var env = new RmlEnvironment(MarshalDelegate(penv->checkCancel), penv->seed, - verbose: penv != null && penv->verbosity > 3, conc: penv != null ? 
penv->maxThreadsAllowed : 0)) + var env = new RmlEnvironment(MarshalDelegate(penv->checkCancel), penv->seed, verbose: penv != null && penv->verbosity > 3); + var host = env.Register("ML.NET_Execution"); + + env.ComponentCatalog.RegisterAssembly(typeof(TextLoader).Assembly); // ML.Data + env.ComponentCatalog.RegisterAssembly(typeof(LinearModelParameters).Assembly); // ML.StandardLearners + env.ComponentCatalog.RegisterAssembly(typeof(CategoricalCatalog).Assembly); // ML.Transforms + env.ComponentCatalog.RegisterAssembly(typeof(FastTreeRegressionTrainer).Assembly); // ML.FastTree + + //env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble + env.ComponentCatalog.RegisterAssembly(typeof(KMeansModelParameters).Assembly); // ML.KMeansClustering + env.ComponentCatalog.RegisterAssembly(typeof(PcaModelParameters).Assembly); // ML.PCA + env.ComponentCatalog.RegisterAssembly(typeof(CVSplit).Assembly); // ML.EntryPoints + + env.ComponentCatalog.RegisterAssembly(typeof(OlsModelParameters).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(LightGbmBinaryModelParameters).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransformer).Assembly); + //env.ComponentCatalog.RegisterAssembly(typeof(SymSgdClassificationTrainer).Assembly); + //env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly); // ML.PipelineInference + env.ComponentCatalog.RegisterAssembly(typeof(DataViewReference).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(ImageLoadingTransformer).Assembly); + //env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly); + //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly); + //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly); + + using (var ch = host.Start("Executing")) { - var host = env.Register("ML.NET_Execution"); - env.ComponentCatalog.RegisterAssembly(typeof(TextLoader).Assembly); // ML.Data - env.ComponentCatalog.RegisterAssembly(typeof(StochasticGradientDescentClassificationTrainer).Assembly); // ML.StandardLearners - env.ComponentCatalog.RegisterAssembly(typeof(CategoricalCatalog).Assembly); // ML.Transforms - env.ComponentCatalog.RegisterAssembly(typeof(FastTreeRegressionTrainer).Assembly); // ML.FastTree - env.ComponentCatalog.RegisterAssembly(typeof(KMeansPlusPlusTrainer).Assembly); // ML.KMeansClustering - env.ComponentCatalog.RegisterAssembly(typeof(RandomizedPcaTrainer).Assembly); // ML.PCA - //env.ComponentCatalog.RegisterAssembly(typeof(Experiment).Assembly); // ML.Legacy - env.ComponentCatalog.RegisterAssembly(typeof(LightGbmRegressorTrainer).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransformer).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(ImageLoaderTransformer).Assembly); - env.ComponentCatalog.RegisterAssembly(typeof(SymSgdClassificationTrainer).Assembly); - //env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly); // ML.PipelineInference - env.ComponentCatalog.RegisterAssembly(typeof(OnnxExportExtensions).Assembly); // ML.Onnx - env.ComponentCatalog.RegisterAssembly(typeof(DataViewReference).Assembly); - //env.ComponentCatalog.RegisterAssembly(typeof(EnsemblePredictor).Assembly); // // ML.Ensemble BUG https://github.com/dotnet/machinelearning/issues/1078 Ensemble isn't in a NuGet package - - using (var ch = host.Start("Executing")) + var sw = new System.Diagnostics.Stopwatch(); + sw.Start(); + try { - var sw = new System.Diagnostics.Stopwatch(); - sw.Start(); - 
try - { - // code, pszIn, and pszOut can be null. - ch.Trace("Checking parameters"); + // code, pszIn, and pszOut can be null. + ch.Trace("Checking parameters"); - host.CheckParam(penv != null, nameof(penv)); - host.CheckParam(penv->messageSink != null, "penv->message"); + host.CheckParam(penv != null, nameof(penv)); + host.CheckParam(penv->messageSink != null, "penv->message"); - host.CheckParam(psz != null, nameof(psz)); + host.CheckParam(psz != null, nameof(psz)); - ch.Trace("Converting graph operands"); - var graph = BytesToString(psz); + ch.Trace("Converting graph operands"); + var graph = BytesToString(psz); - ch.Trace("Wiring message sink"); - var message = MarshalDelegate(penv->messageSink); - var messageValidator = new MessageValidator(host); - var lk = new object(); - Action listener = - (sender, msg) => + ch.Trace("Wiring message sink"); + var message = MarshalDelegate(penv->messageSink); + var messageValidator = new MessageValidator(host); + var lk = new object(); + Action listener = + (sender, msg) => + { + byte[] bs = StringToNullTerminatedBytes(sender.FullName); + string m = messageValidator.Validate(msg); + if (!string.IsNullOrEmpty(m)) { - byte[] bs = StringToNullTerminatedBytes(sender.FullName); - string m = messageValidator.Validate(msg); - if (!string.IsNullOrEmpty(m)) + byte[] bm = StringToNullTerminatedBytes(m); + lock (lk) { - byte[] bm = StringToNullTerminatedBytes(m); - lock (lk) - { - fixed (byte* ps = bs) - fixed (byte* pm = bm) - message(penv, msg.Kind, (sbyte*)ps, (sbyte*)pm); - } + fixed (byte* ps = bs) + fixed (byte* pm = bm) + message(penv, msg.Kind, (sbyte*)ps, (sbyte*)pm); } - }; - env.AddListener(listener); + } + }; + env.AddListener(listener); - host.CheckParam(cdata >= 0, nameof(cdata), "must be non-negative"); - host.CheckParam(ppdata != null || cdata == 0, nameof(ppdata)); - for (int i = 0; i < cdata; i++) + host.CheckParam(cdata >= 0, nameof(cdata), "must be non-negative"); + host.CheckParam(ppdata != null || cdata == 0, nameof(ppdata)); + for (int i = 0; i < cdata; i++) + { + var pdata = ppdata[i]; + host.CheckParam(pdata != null, "pdata"); + host.CheckParam(0 <= pdata->ccol && pdata->ccol <= int.MaxValue, "ccol"); + host.CheckParam(0 <= pdata->crow && pdata->crow <= long.MaxValue, "crow"); + if (pdata->ccol > 0) { - var pdata = ppdata[i]; - host.CheckParam(pdata != null, "pdata"); - host.CheckParam(0 <= pdata->ccol && pdata->ccol <= int.MaxValue, "ccol"); - host.CheckParam(0 <= pdata->crow && pdata->crow <= long.MaxValue, "crow"); - if (pdata->ccol > 0) - { - host.CheckParam(pdata->names != null, "names"); - host.CheckParam(pdata->kinds != null, "kinds"); - host.CheckParam(pdata->keyCards != null, "keyCards"); - host.CheckParam(pdata->vecCards != null, "vecCards"); - host.CheckParam(pdata->getters != null, "getters"); - } + host.CheckParam(pdata->names != null, "names"); + host.CheckParam(pdata->kinds != null, "kinds"); + host.CheckParam(pdata->keyCards != null, "keyCards"); + host.CheckParam(pdata->vecCards != null, "vecCards"); + host.CheckParam(pdata->getters != null, "getters"); } + } - ch.Trace("Validating number of data sources"); + ch.Trace("Validating number of data sources"); - // Wrap the data sets. - ch.Trace("Wrapping native data sources"); - ch.Trace("Executing"); - ExecCore(penv, host, ch, graph, cdata, ppdata); - } - catch (Exception e) - { - // Dump the exception chain. 
- var ex = e; - while (ex.InnerException != null) - ex = ex.InnerException; - ch.Error("*** {1}: '{0}'", ex.Message, ex.GetType()); - return -1; - } - finally - { - sw.Stop(); - if (penv != null && penv->verbosity > 0) - ch.Info("Elapsed time: {0}", sw.Elapsed); - else - ch.Trace("Elapsed time: {0}", sw.Elapsed); - } + // Wrap the data sets. + ch.Trace("Wrapping native data sources"); + ch.Trace("Executing"); + ExecCore(penv, host, ch, graph, cdata, ppdata); + } + catch (Exception e) + { + // Dump the exception chain. + var ex = e; + while (ex.InnerException != null) + ex = ex.InnerException; + ch.Error("*** {1}: '{0}'", ex.Message, ex.GetType()); + return -1; + } + finally + { + sw.Stop(); + if (penv != null && penv->verbosity > 0) + ch.Info("Elapsed time: {0}", sw.Elapsed); + else + ch.Trace("Elapsed time: {0}", sw.Elapsed); } } return 0; diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index e9ecab39..1c1cb0e6 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -31,13 +31,15 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - + + + + + + + + + + diff --git a/src/DotNetBridge/MessageValidator.cs b/src/DotNetBridge/MessageValidator.cs index 0464319e..2aa78c27 100644 --- a/src/DotNetBridge/MessageValidator.cs +++ b/src/DotNetBridge/MessageValidator.cs @@ -5,7 +5,7 @@ using System; using System.Globalization; -using Microsoft.ML; +using Microsoft.ML.Runtime; namespace Microsoft.MachineLearning.DotNetBridge { diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs index ca233d6f..c9b70526 100644 --- a/src/DotNetBridge/NativeDataInterop.cs +++ b/src/DotNetBridge/NativeDataInterop.cs @@ -5,11 +5,13 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Globalization; using System.Runtime.InteropServices; using System.Text; using Microsoft.ML; using Microsoft.ML.Data; +using Microsoft.ML.Runtime; namespace Microsoft.MachineLearning.DotNetBridge { @@ -32,7 +34,7 @@ private struct DataSourceBlock [FieldOffset(0x18)] public readonly sbyte** names; [FieldOffset(0x20)] - public readonly DataKind* kinds; + public readonly InternalDataKind* kinds; [FieldOffset(0x28)] public readonly long* keyCards; [FieldOffset(0x30)] @@ -69,7 +71,7 @@ private struct DataViewBlock // Column data kinds. [FieldOffset(0x18)] - public DataKind* kinds; + public InternalDataKind* kinds; // For columns that have key type, these contain the cardinalities of the // key types. Zero means unbounded, -1 means not a key type. @@ -107,7 +109,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, var schema = view.Schema; var colIndices = new List(); - var kindList = new List(); + var kindList = new List(); var keyCardList = new List(); var nameUtf8Bytes = new List(); var nameIndices = new List(); @@ -121,71 +123,71 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, continue; var fullType = schema[col].Type; - var itemType = fullType.ItemType; + var itemType = fullType.GetItemType(); var name = schema[col].Name; - DataKind kind = itemType.RawKind; + var kind = itemType.GetRawKind(); int keyCard; - if (fullType.ValueCount == 0) + if (fullType.GetValueCount() == 0) { throw ch.ExceptNotSupp("Column has variable length vector: " + name + ". Not supported in python. 
Drop column before sending to Python"); } - if (itemType.IsKey) + if (itemType is KeyDataViewType) { // Key types are returned as their signed counterparts in Python, so that -1 can be the missing value. // For U1 and U2 kinds, we convert to a larger type to prevent overflow. For U4 and U8 kinds, we convert // to I4 if the key count is known (since KeyCount is an I4), and to I8 otherwise. switch (kind) { - case DataKind.U1: - kind = DataKind.I2; + case InternalDataKind.U1: + kind = InternalDataKind.I2; break; - case DataKind.U2: - kind = DataKind.I4; + case InternalDataKind.U2: + kind = InternalDataKind.I4; break; - case DataKind.U4: + case InternalDataKind.U4: // We convert known-cardinality U4 key types to I4. - kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8; + kind = itemType.GetKeyCount() > 0 ? InternalDataKind.I4 : InternalDataKind.I8; break; - case DataKind.U8: + case InternalDataKind.U8: // We convert known-cardinality U8 key types to I4. - kind = itemType.KeyCount > 0 ? DataKind.I4 : DataKind.I8; + kind = itemType.GetKeyCount() > 0 ? InternalDataKind.I4 : InternalDataKind.I8; break; } - keyCard = itemType.KeyCount; - if (!schema[col].HasKeyValues(keyCard)) + keyCard = itemType.GetKeyCountAsInt32(); + if (!schema[col].HasKeyValues()) keyCard = -1; } else if (itemType.IsStandardScalar()) { - switch (itemType.RawKind) + switch (itemType.GetRawKind()) { default: - throw Contracts.Except("Data type {0} not handled", itemType.RawKind); - - case DataKind.I1: - case DataKind.I2: - case DataKind.I4: - case DataKind.I8: - case DataKind.U1: - case DataKind.U2: - case DataKind.U4: - case DataKind.U8: - case DataKind.R4: - case DataKind.R8: - case DataKind.BL: - case DataKind.TX: + throw Contracts.Except("Data type {0} not handled", itemType.GetRawKind()); + + case InternalDataKind.I1: + case InternalDataKind.I2: + case InternalDataKind.I4: + case InternalDataKind.I8: + case InternalDataKind.U1: + case InternalDataKind.U2: + case InternalDataKind.U4: + case InternalDataKind.U8: + case InternalDataKind.R4: + case InternalDataKind.R8: + case InternalDataKind.BL: + case InternalDataKind.TX: break; } keyCard = -1; } else { - throw Contracts.Except("Data type {0} not handled", itemType.RawKind); + throw Contracts.Except("Data type {0} not handled", itemType.GetRawKind()); } int nSlots; @@ -193,8 +195,8 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, if (infos != null && infos.TryGetValue(name, out info) && info.Expand) { expandCols.Add(col); - Contracts.Assert(fullType.IsKnownSizeVector); - nSlots = fullType.VectorSize; + Contracts.Assert(fullType.IsKnownSizeVector()); + nSlots = fullType.GetVectorSize(); if (info.SlotNames != null) { Contracts.Assert(info.SlotNames.Length == nSlots); @@ -204,7 +206,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, else if (schema[col].HasSlotNames(nSlots)) { var romNames = default(VBuffer>); - schema[col].Metadata.GetValue(MetadataUtils.Kinds.SlotNames, ref romNames); + schema[col].Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref romNames); foreach (var kvp in romNames.Items(true)) { // REVIEW: Add the proper number of zeros to the slot index to make them sort in the right order. 
@@ -242,7 +244,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, var nameBytes = nameUtf8Bytes.ToArray(); var names = new byte*[allNames.Count]; - fixed (DataKind* prgkind = kinds) + fixed (InternalDataKind* prgkind = kinds) fixed (byte* prgbNames = nameBytes) fixed (byte** prgname = names) fixed (int* prgkeyCard = keyCards) @@ -266,7 +268,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, } ch.Assert(keyValueSetter != null); var kvSet = MarshalDelegate(keyValueSetter); - using (var cursor = view.GetRowCursor(colIndices.Contains)) + using (var cursor = view.GetRowCursor(view.Schema.Where(col => colIndices.Contains(col.Index)))) { var fillers = new BufferFillerBase[colIndices.Count]; var pyColumn = 0; @@ -274,12 +276,13 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, for (int i = 0; i < colIndices.Count; i++) { var type = schema[colIndices[i]].Type; - if (type.ItemType.IsKey && schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount)) + var itemType = type.GetItemType(); + if ((itemType is KeyDataViewType) && schema[colIndices[i]].HasKeyValues()) { - ch.Assert(schema[colIndices[i]].HasKeyValues(type.ItemType.KeyCount)); + ch.Assert(schema[colIndices[i]].HasKeyValues()); var keyValues = default(VBuffer>); - schema[colIndices[i]].Metadata.GetValue(MetadataUtils.Kinds.KeyValues, ref keyValues); - for (int slot = 0; slot < type.ValueCount; slot++) + schema[colIndices[i]].Annotations.GetValue(AnnotationUtils.Kinds.KeyValues, ref keyValues); + for (int slot = 0; slot < type.GetValueCount(); slot++) { foreach (var kvp in keyValues.Items()) { @@ -296,7 +299,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, } } fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], kinds[pyColumn], type, setters[pyColumn]); - pyColumn += type.IsVector ? type.VectorSize : 1; + pyColumn += type is VectorDataViewType ? type.GetVectorSize() : 1; } for (int crow = 0; ; crow++) { @@ -333,40 +336,40 @@ private abstract unsafe class BufferFillerBase public delegate void ValuePoker(T value, int col, long index); protected readonly int _colIndex; - protected readonly Row _input; + protected readonly DataViewRow _input; - protected BufferFillerBase(Row input, int pyColIndex) + protected BufferFillerBase(DataViewRow input, int pyColIndex) { _colIndex = pyColIndex; _input = input; } - public static BufferFillerBase Create(EnvironmentBlock* penv, Row input, int pyCol, int idvCol, DataKind dataKind, ColumnType type, void* setter) + public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, int pyCol, int idvCol, InternalDataKind dataKind, DataViewType type, void* setter) { - var itemType = type.ItemType; + var itemType = type.GetItemType(); // We convert the unsigned types to signed types, with -1 indicating missing in Python. - if (itemType.KeyCount > 0) + if (itemType.GetKeyCount() > 0) { - var keyCount = itemType.KeyCount; + var keyCount = itemType.GetKeyCount(); uint keyMax = (uint)keyCount; - switch (itemType.RawKind) + switch (itemType.GetRawKind()) { - case DataKind.U1: + case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = (byte value, int col, long index) => fnI1(penv, col, index, value > keyMax ? 
(sbyte)-1 : (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); - case DataKind.U2: + case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = (ushort value, int col, long index) => fnI2(penv, col, index, value > keyMax ? (short)-1 : (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); - case DataKind.U4: + case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = (uint value, int col, long index) => fnI4(penv, col, index, value > keyMax ? -1 : (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); - case DataKind.U8: + case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = @@ -375,26 +378,26 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, Row input, int pyC } } // Key type with count=0 - else if (itemType.IsKey) + else if (itemType is KeyDataViewType) { - switch (itemType.RawKind) + switch (itemType.GetRawKind()) { - case DataKind.U1: + case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = (byte value, int col, long index) => fnI1(penv, col, index, (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); - case DataKind.U2: + case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = (ushort value, int col, long index) => fnI2(penv, col, index, (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); - case DataKind.U4: + case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = (uint value, int col, long index) => fnI4(penv, col, index, (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); - case DataKind.U8: + case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = @@ -406,62 +409,62 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, Row input, int pyC { switch (dataKind) { - case DataKind.R4: + case InternalDataKind.R4: var fnR4 = MarshalDelegate(setter); ValuePoker pokeR4 = (float value, int col, long index) => fnR4(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeR4); - case DataKind.R8: + case InternalDataKind.R8: var fnR8 = MarshalDelegate(setter); ValuePoker pokeR8 = (double value, int col, long index) => fnR8(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeR8); - case DataKind.BL: + case InternalDataKind.BL: var fnBl = MarshalDelegate(setter); ValuePoker pokeBl = (bool value, int col, long index) => fnBl(penv, col, index, !value ? (byte)0 : value ? 
(byte)1 : (byte)0xFF); return new Impl(input, pyCol, idvCol, type, pokeBl); - case DataKind.I1: + case InternalDataKind.I1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeI1 = (sbyte value, int col, long index) => fnI1(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeI1); - case DataKind.I2: + case InternalDataKind.I2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeI2 = (short value, int col, long index) => fnI2(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeI2); - case DataKind.I4: + case InternalDataKind.I4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeI4 = (int value, int col, long index) => fnI4(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeI4); - case DataKind.I8: + case InternalDataKind.I8: var fnI8 = MarshalDelegate(setter); ValuePoker pokeI8 = (long value, int col, long index) => fnI8(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeI8); - case DataKind.U1: + case InternalDataKind.U1: var fnU1 = MarshalDelegate(setter); ValuePoker pokeU1 = (byte value, int col, long index) => fnU1(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeU1); - case DataKind.U2: + case InternalDataKind.U2: var fnU2 = MarshalDelegate(setter); ValuePoker pokeU2 = (ushort value, int col, long index) => fnU2(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeU2); - case DataKind.U4: + case InternalDataKind.U4: var fnU4 = MarshalDelegate(setter); ValuePoker pokeU4 = (uint value, int col, long index) => fnU4(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeU4); - case DataKind.U8: + case InternalDataKind.U8: var fnU8 = MarshalDelegate(setter); ValuePoker pokeU8 = (ulong value, int col, long index) => fnU8(penv, col, index, value); return new Impl(input, pyCol, idvCol, type, pokeU8); - case DataKind.TX: + case InternalDataKind.TX: var fnTX = MarshalDelegate(setter); ValuePoker> pokeTX = (ReadOnlyMemory value, int col, long index) => @@ -494,14 +497,14 @@ private sealed class Impl : BufferFillerBase private readonly ValueGetter _get; private readonly ValuePoker _poker; - public Impl(Row input, int pyColIndex, int idvColIndex, ColumnType type, ValuePoker poker) + public Impl(DataViewRow input, int pyColIndex, int idvColIndex, DataViewType type, ValuePoker poker) : base(input, pyColIndex) { Contracts.AssertValue(input); Contracts.Assert(0 <= idvColIndex && idvColIndex < input.Schema.Count); - if (type.IsVector) - _getVec = RowCursorUtils.GetVecGetterAs((PrimitiveType)type.ItemType, input, idvColIndex); + if (type is VectorDataViewType) + _getVec = RowCursorUtils.GetVecGetterAs((PrimitiveDataViewType)type.GetItemType(), input, idvColIndex); else _get = RowCursorUtils.GetGetterAs(type, input, idvColIndex); diff --git a/src/DotNetBridge/NativeDataView.cs b/src/DotNetBridge/NativeDataView.cs index 5c766745..09796203 100644 --- a/src/DotNetBridge/NativeDataView.cs +++ b/src/DotNetBridge/NativeDataView.cs @@ -11,6 +11,8 @@ using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Internal.Utilities; +using System.Threading.Tasks; +using Microsoft.ML.Runtime; namespace Microsoft.MachineLearning.DotNetBridge { @@ -32,7 +34,7 @@ private sealed class NativeDataView : IDataView, IDisposable /// This is a by-product of using the new API. As a compromise, /// instead of changing all derived classes, /// we decided to keep this duplicate piece of data as a quick solution. 
- public Schema Schema { get; } + public DataViewSchema Schema { get; } public NativeDataView(IHostEnvironment env, DataSourceBlock* pdata) { @@ -57,29 +59,29 @@ public NativeDataView(IHostEnvironment env, DataSourceBlock* pdata) default: _host.Assert(false); break; - case DataKind.BL: + case InternalDataKind.BL: if (pdata->vecCards[c] == -1) columns.Add(new BoolColumn(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorBoolColumn(pdata, pdata->getters[c], c, name, new VectorType(BoolType.Instance, (int)pdata->vecCards[c]))); + columns.Add(new VectorBoolColumn(pdata, pdata->getters[c], c, name, new VectorDataViewType(BooleanDataViewType.Instance, (int)pdata->vecCards[c]))); break; - case DataKind.U1: + case InternalDataKind.U1: // catch if categoricals are passed by other than U4 types Contracts.Assert(pdata->keyCards[c] <= 0); if (pdata->vecCards[c] == -1) columns.Add(new U1Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorUInt1Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.U1, (int)pdata->vecCards[c]))); + columns.Add(new VectorUInt1Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Byte, (int)pdata->vecCards[c]))); break; - case DataKind.U2: + case InternalDataKind.U2: // catch if categoricals are passed by other than U4 types Contracts.Assert(pdata->keyCards[c] <= 0); if (pdata->vecCards[c] == -1) columns.Add(new U2Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorUInt2Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.U2, (int)pdata->vecCards[c]))); + columns.Add(new VectorUInt2Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.UInt16, (int)pdata->vecCards[c]))); break; - case DataKind.U4: + case InternalDataKind.U4: if (pdata->keyCards[c] > 0) { // Categoricals from python are passed as U4 type @@ -92,62 +94,62 @@ public NativeDataView(IHostEnvironment env, DataSourceBlock* pdata) else if (pdata->vecCards[c] == -1) columns.Add(new U4Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorUInt4Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.U4, (int)pdata->vecCards[c]))); + columns.Add(new VectorUInt4Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.UInt32, (int)pdata->vecCards[c]))); break; - case DataKind.U8: + case InternalDataKind.U8: // catch if categoricals are passed by other than U4 types Contracts.Assert(pdata->keyCards[c] <= 0); if (pdata->vecCards[c] == -1) columns.Add(new U8Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorUInt8Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.U8, (int)pdata->vecCards[c]))); + columns.Add(new VectorUInt8Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Double, (int)pdata->vecCards[c]))); break; - case DataKind.I1: + case InternalDataKind.I1: if (pdata->vecCards[c] == -1) columns.Add(new I1Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorInt1Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.I1, (int)pdata->vecCards[c]))); + columns.Add(new VectorInt1Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.SByte, (int)pdata->vecCards[c]))); break; - case DataKind.I2: + case InternalDataKind.I2: if (pdata->vecCards[c] == -1) columns.Add(new I2Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorInt2Column(pdata, pdata->getters[c], c, name, new 
VectorType(NumberType.I2, (int)pdata->vecCards[c]))); + columns.Add(new VectorInt2Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Int16, (int)pdata->vecCards[c]))); break; - case DataKind.I4: + case InternalDataKind.I4: if (pdata->vecCards[c] == -1) columns.Add(new I4Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorInt4Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.I4, (int)pdata->vecCards[c]))); + columns.Add(new VectorInt4Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Int32, (int)pdata->vecCards[c]))); break; - case DataKind.I8: + case InternalDataKind.I8: if (pdata->vecCards[c] == -1) columns.Add(new I8Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorInt8Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.I8, (int)pdata->vecCards[c]))); + columns.Add(new VectorInt8Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Int64, (int)pdata->vecCards[c]))); break; - case DataKind.R8: + case InternalDataKind.R8: if (pdata->vecCards[c] == -1) columns.Add(new R8Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorR8Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.R8, (int)pdata->vecCards[c]))); + columns.Add(new VectorR8Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Double, (int)pdata->vecCards[c]))); break; - case DataKind.R4: + case InternalDataKind.R4: if (pdata->vecCards[c] == -1) columns.Add(new R4Column(pdata, pdata->getters[c], c, name)); else - columns.Add(new VectorR4Column(pdata, pdata->getters[c], c, name, new VectorType(NumberType.R4, (int)pdata->vecCards[c]))); + columns.Add(new VectorR4Column(pdata, pdata->getters[c], c, name, new VectorDataViewType(NumberDataViewType.Single, (int)pdata->vecCards[c]))); break; - case DataKind.Text: + case InternalDataKind.Text: columns.Add(new TextColumn(pdata, pdata->getters[c], c, name)); break; } } _columns = columns.ToArray(); - var schemaBuilder = new SchemaBuilder(); + var schemaBuilder = new DataViewSchema.Builder(); schemaBuilder.AddColumns(columns.Select(c => c.DetachedColumn)); - Schema = schemaBuilder.GetSchema(); + Schema = schemaBuilder.ToSchema(); } public long? 
GetRowCount() @@ -155,21 +157,21 @@ public NativeDataView(IHostEnvironment env, DataSourceBlock* pdata) return _rowCount; } - public RowCursor GetRowCursor(Func needCol, Random rand = null) + public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null) { - _host.CheckValue(needCol, nameof(needCol)); + _host.CheckValue(columnsNeeded, nameof(columnsNeeded)); _host.CheckValueOrNull(rand); - var active = Utils.BuildArray(_columns.Length, needCol); + var active = Utils.BuildArray(_columns.Length, columnsNeeded); return NativeRowCursor.CreateSet(_host, this, active, 1, rand)[0]; } - public RowCursor[] GetRowCursorSet(Func needCol, int n, Random rand = null) + public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) { - _host.CheckValue(needCol, nameof(needCol)); + _host.CheckValue(columnsNeeded, nameof(columnsNeeded)); _host.CheckValueOrNull(rand); - var active = Utils.BuildArray(_columns.Length, needCol); + var active = Utils.BuildArray(_columns.Length, columnsNeeded); return NativeRowCursor.CreateSet(_host, this, active, n, rand); } @@ -218,7 +220,7 @@ private sealed class NativeRowCursor : RootCursorBase private bool _justLoaded; private bool _disposed; - public override Schema Schema => _view.Schema; + public override DataViewSchema Schema => _view.Schema; public override long Batch => _batchId; @@ -238,10 +240,10 @@ private NativeRowCursor(IChannelProvider provider, NativeDataView view, bool[] a _justLoaded = false; } - public override ValueGetter GetGetter(int col) + public override ValueGetter GetGetter(DataViewSchema.Column col) { - Ch.CheckParam(_active[col], nameof(col), "column is not active"); - var column = _view._columns[col] as Column; + Ch.CheckParam(_active[col.Index], nameof(col.Index), "column is not active"); + var column = _view._columns[col.Index] as Column; if (column == null) throw Ch.Except("Invalid TValue: '{0}'", typeof(TValue)); @@ -255,10 +257,10 @@ public override ValueGetter GetGetter(int col) }; } - public override bool IsColumnActive(int col) + public override bool IsColumnActive(DataViewSchema.Column column) { - Contracts.Check(0 <= col && col < Schema.Count); - return _active[col]; + Contracts.Check(0 <= column.Index && column.Index < Schema.Count); + return _active[column.Index]; } protected override void Dispose(bool disposing) @@ -271,20 +273,19 @@ protected override void Dispose(bool disposing) base.Dispose(disposing); } - public override ValueGetter GetIdGetter() + public override ValueGetter GetIdGetter() { return - (ref RowId val) => + (ref DataViewRowId val) => { Ch.Check(IsGood, "Cannot call ID getter in current state"); long index = Position % BatchSize + _batchId * BatchSize; - val = new RowId((ulong)index, 0); + val = new DataViewRowId((ulong)index, 0); }; } protected override bool MoveNextCore() { - Ch.Assert(State != CursorState.Done); long index = Position % BatchSize + _batchId * BatchSize; Ch.Assert(index < _view._rowCount); if ((Position + 1) % BatchSize == 0 && !_justLoaded) @@ -302,7 +303,7 @@ protected override bool MoveNextCore() return index < _view._rowCount; } - public static RowCursor[] CreateSet(IChannelProvider provider, NativeDataView view, bool[] active, int n, Random rand) + public static DataViewRowCursor[] CreateSet(IChannelProvider provider, NativeDataView view, bool[] active, int n, Random rand) { Contracts.AssertValue(provider); provider.AssertValue(view); @@ -312,10 +313,10 @@ public static RowCursor[] CreateSet(IChannelProvider provider, NativeDataView vi var 
reader = new TextColumnReader(BatchSize, view._rowCount, n, view._columns); if (n <= 1) { - return new RowCursor[1] { new NativeRowCursor(provider, view, active, rand, reader) }; + return new DataViewRowCursor[1] { new NativeRowCursor(provider, view, active, rand, reader) }; } - var cursors = new RowCursor[n]; + var cursors = new DataViewRowCursor[n]; try { for (int i = 0; i < cursors.Length; i++) @@ -395,7 +396,7 @@ private sealed class TextColumnReader : IDisposable // The reader can be referenced by multiple workers. This is the reference count. private int _cref; private BlockingCollection _queue; - private Thread _thdRead; + private Task _thdRead; private volatile bool _abort; public TextColumnReader(int batchSize, long rowsCount, int cref, Column[] columns) @@ -412,8 +413,7 @@ public TextColumnReader(int batchSize, long rowsCount, int cref, Column[] column _waiterPublish = new OrderedWaiter(firstCleared: true); _queue = new BlockingCollection(QueueSize); - _thdRead = Utils.CreateBackgroundThread(ThreadProc); - _thdRead.Start(); + _thdRead = Utils.RunOnBackgroundThread(ThreadProc); } public void Release() @@ -428,7 +428,7 @@ public void Release() { _abort = true; _waiterPublish.IncrementAll(); - _thdRead.Join(); + _thdRead.Wait(); _thdRead = null; } @@ -470,7 +470,7 @@ private void ThreadProc() long batchId = -1; long total = 0; - var txtColumns = _columns.Where(c => c.DetachedColumn.Type is TextType).ToList(); + var txtColumns = _columns.Where(c => c.DetachedColumn.Type is TextDataViewType).ToList(); int index = 0; var infos = new Row[_batchSize]; @@ -555,13 +555,13 @@ private abstract class Column : IDisposable public readonly int ColIndex; protected const string AlreadyDisposed = "Native wrapped column has been disposed"; - protected Column(DataSourceBlock* data, int colIndex, string name, ColumnType type) + protected Column(DataSourceBlock* data, int colIndex, string name, DataViewType type) { Contracts.AssertNonWhiteSpace(name); Contracts.AssertValue(type); Data = data; ColIndex = colIndex; - DetachedColumn = new Schema.DetachedColumn(name, type); + DetachedColumn = new DataViewSchema.DetachedColumn(name, type); } public virtual void Dispose() @@ -571,12 +571,12 @@ public virtual void Dispose() /// This field contains some duplicate information with . /// For more information please see the remarks on . 
- public Schema.DetachedColumn DetachedColumn { get; protected set; } + public DataViewSchema.DetachedColumn DetachedColumn { get; protected set; } } private abstract class Column : Column { - protected Column(DataSourceBlock* data, int colIndex, string name, ColumnType type) + protected Column(DataSourceBlock* data, int colIndex, string name, DataViewType type) : base(data, colIndex, name, type) { Contracts.Assert(typeof(TOut) == type.RawType); @@ -593,7 +593,7 @@ private sealed class BoolColumn : Column private BLGetter _getter; public BoolColumn(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, BoolType.Instance) + : base(data, colIndex, name, BooleanDataViewType.Instance) { _getter = MarshalDelegate(getter); } @@ -622,7 +622,7 @@ private sealed class I1Column : Column private I1Getter _getter; public I1Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.I1) + : base(data, colIndex, name, NumberDataViewType.SByte) { _getter = MarshalDelegate(getter); } @@ -647,7 +647,7 @@ private sealed class I2Column : Column private I2Getter _getter; public I2Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.I2) + : base(data, colIndex, name, NumberDataViewType.Int16) { _getter = MarshalDelegate(getter); } @@ -672,7 +672,7 @@ private sealed class I4Column : Column private I4Getter _getter; public I4Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.I4) + : base(data, colIndex, name, NumberDataViewType.Int32) { _getter = MarshalDelegate(getter); } @@ -697,7 +697,7 @@ private sealed class I8Column : Column private I8Getter _getter; public I8Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.I8) + : base(data, colIndex, name, NumberDataViewType.Int64) { _getter = MarshalDelegate(getter); } @@ -724,7 +724,7 @@ private sealed class U1Column : Column private U1Getter _getter; public U1Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.U1) + : base(data, colIndex, name, NumberDataViewType.Byte) { _getter = MarshalDelegate(getter); } @@ -748,7 +748,7 @@ private sealed class U2Column : Column private U2Getter _getter; public U2Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.U2) + : base(data, colIndex, name, NumberDataViewType.UInt16) { _getter = MarshalDelegate(getter); } @@ -772,7 +772,7 @@ private sealed class U4Column : Column private U4Getter _getter; public U4Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.U4) + : base(data, colIndex, name, NumberDataViewType.UInt32) { _getter = MarshalDelegate(getter); } @@ -796,7 +796,7 @@ private sealed class U8Column : Column private U8Getter _getter; public U8Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.U8) + : base(data, colIndex, name, NumberDataViewType.UInt64) { _getter = MarshalDelegate(getter); } @@ -822,7 +822,7 @@ private sealed class R8Column : Column private R8Getter _getter; public R8Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.R8) + : base(data, colIndex, name, NumberDataViewType.Double) { _getter = MarshalDelegate(getter); } @@ -848,7 +848,7 @@ 
private sealed class R4Column : Column private R4Getter _getter; public R4Column(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, NumberType.R4) + : base(data, colIndex, name, NumberDataViewType.Single) { _getter = MarshalDelegate(getter); } @@ -872,7 +872,7 @@ private sealed class TextColumn : Column> private TXGetter _getter; public TextColumn(DataSourceBlock* data, void* getter, int colIndex, string name) - : base(data, colIndex, name, TextType.Instance) + : base(data, colIndex, name, TextDataViewType.Instance) { _getter = MarshalDelegate(getter); } @@ -912,7 +912,7 @@ private sealed class KeyColumn : Column private U4Getter _getter; public KeyColumn(DataSourceBlock* data, void* getter, int colIndex, string name, int keyCount, ref VBuffer> keyValues) - : base(data, colIndex, name, new KeyType(DataKind.U4, 0, keyCount)) + : base(data, colIndex, name, new KeyDataViewType(typeof(uint), keyCount)) { Contracts.Assert(keyCount >= 0); Contracts.Assert(keyValues.Length == 0 || keyValues.Length == keyCount); @@ -924,10 +924,10 @@ public KeyColumn(DataSourceBlock* data, void* getter, int colIndex, string name, keyValues.CopyTo(ref _keyValues); ValueGetter>> getKeyValues = (ref VBuffer> dst) => _keyValues.CopyTo(ref dst); - var metadataBuilder = new MetadataBuilder(); - metadataBuilder.AddKeyValues(keyCount, TextType.Instance, getKeyValues); - DetachedColumn = new Schema.DetachedColumn( - name, new KeyType(DataKind.U4, 0, keyCount), metadataBuilder.GetMetadata()); + var metadataBuilder = new DataViewSchema.Annotations.Builder(); + metadataBuilder.AddKeyValues(keyCount, TextDataViewType.Instance, getKeyValues); + DetachedColumn = new DataViewSchema.DetachedColumn( + name, new KeyDataViewType(typeof(uint), keyCount), metadataBuilder.ToAnnotations()); } } @@ -950,11 +950,11 @@ private sealed class VectorBoolColumn : Column> private BLVectorGetter _getter; private readonly int _length; - public VectorBoolColumn(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorBoolColumn(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -989,11 +989,11 @@ private sealed class VectorUInt1Column : Column> private U1VectorGetter _getter; private readonly int _length; - public VectorUInt1Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorUInt1Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1028,11 +1028,11 @@ private sealed class VectorUInt2Column : Column> private U2VectorGetter _getter; private readonly int _length; - public VectorUInt2Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorUInt2Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1067,11 +1067,11 @@ private sealed 
class VectorUInt4Column : Column> private U4VectorGetter _getter; private readonly int _length; - public VectorUInt4Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorUInt4Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1106,11 +1106,11 @@ private sealed class VectorUInt8Column : Column> private U8VectorGetter _getter; private readonly int _length; - public VectorUInt8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorUInt8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1145,11 +1145,11 @@ private sealed class VectorInt1Column : Column> private I1VectorGetter _getter; private readonly int _length; - public VectorInt1Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorInt1Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1184,11 +1184,11 @@ private sealed class VectorInt2Column : Column> private I2VectorGetter _getter; private readonly int _length; - public VectorInt2Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorInt2Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1223,11 +1223,11 @@ private sealed class VectorInt4Column : Column> private I4VectorGetter _getter; private readonly int _length; - public VectorInt4Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorInt4Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1262,11 +1262,11 @@ private sealed class VectorInt8Column : Column> private I8VectorGetter _getter; private readonly int _length; - public VectorInt8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorInt8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1302,11 +1302,11 @@ private sealed class VectorR4Column : Column> private R4VectorGetter _getter; private readonly int _length; - public VectorR4Column(DataSourceBlock* data, void* getter, int 
colIndex, string name, VectorType type) + public VectorR4Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) @@ -1341,11 +1341,11 @@ private sealed class VectorR8Column : Column> private R8VectorGetter _getter; private readonly int _length; - public VectorR8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorType type) + public VectorR8Column(DataSourceBlock* data, void* getter, int colIndex, string name, VectorDataViewType type) : base(data, colIndex, name, type) { _getter = MarshalDelegate(getter); - _length = type.VectorSize; + _length = type.GetVectorSize(); } public override void CopyOut(long index, Batch batch, ref VBuffer dst) diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs index dd62da0e..d2e861fe 100644 --- a/src/DotNetBridge/RmlEnvironment.cs +++ b/src/DotNetBridge/RmlEnvironment.cs @@ -6,7 +6,7 @@ using System; using System.Globalization; using Microsoft.ML; -using Microsoft.ML.Data; +using Microsoft.ML.Runtime; namespace Microsoft.MachineLearning.DotNetBridge { @@ -25,12 +25,11 @@ public Channel(RmlEnvironment master, ChannelProviderBase parent, string shortNa private sealed class Host : HostBase { - public Host(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose, int? conc) - : base(source, shortName, parentFullName, rand, verbose, conc) + public Host(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose) + : base(source, shortName, parentFullName, rand, verbose) { } - public new bool IsCancelled { get { return Root.IsCancelled; } } protected override IChannel CreateCommChannel(ChannelProviderBase parent, string name) { Contracts.AssertValue(parent); @@ -47,47 +46,45 @@ protected override IPipe CreatePipe(ChannelProviderBase pare return new Pipe(parent, name, GetDispatchDelegate()); } - protected override IHost RegisterCore(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose, int? conc) + protected override IHost RegisterCore(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose) { - return new Host(source, shortName, parentFullName, rand, verbose, conc); + return new Host(source, shortName, parentFullName, rand, verbose); } } - public new bool IsCancelled { get { return CheckCancelled(); } } - - public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false, int conc = 0) - : this(RandomUtils.Create(seed), verbose, conc) + public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false) + : this(RandomUtils.Create(seed), verbose) { CheckCancelled = checkDelegate; } - public RmlEnvironment(Random rand, bool verbose = false, int conc = 0) - : base(rand, verbose, conc) + public RmlEnvironment(Random rand, bool verbose = false) + : base(rand, verbose) { CultureInfo.CurrentUICulture = CultureInfo.InvariantCulture; EnsureDispatcher(); } - public RmlEnvironment(RmlEnvironment source, int? seed = null, bool verbose = false, int conc = 0) - : this(source, RandomUtils.Create(seed), verbose, conc) + public RmlEnvironment(RmlEnvironment source, int? 
seed = null, bool verbose = false) + : this(source, RandomUtils.Create(seed), verbose) { } - public RmlEnvironment(RmlEnvironment source, Random rand, bool verbose = false, int conc = 0) - : base(source, rand, verbose, conc) + public RmlEnvironment(RmlEnvironment source, Random rand, bool verbose = false) + : base(source, rand, verbose) { CultureInfo.CurrentUICulture = CultureInfo.InvariantCulture; EnsureDispatcher(); } - protected override IHost RegisterCore(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose, int? conc) + protected override IHost RegisterCore(HostEnvironmentBase source, string shortName, string parentFullName, Random rand, bool verbose) { Contracts.AssertValue(rand); Contracts.AssertValueOrNull(parentFullName); Contracts.AssertNonEmpty(shortName); Contracts.Assert(source == this || source is Host); - return new Host(source, shortName, parentFullName, rand, verbose, conc); + return new Host(source, shortName, parentFullName, rand, verbose); } diff --git a/src/DotNetBridge/RunGraph.cs b/src/DotNetBridge/RunGraph.cs index 63a10e01..09617aa6 100644 --- a/src/DotNetBridge/RunGraph.cs +++ b/src/DotNetBridge/RunGraph.cs @@ -13,10 +13,9 @@ using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.EntryPoints; -using Microsoft.ML.EntryPoints.JsonUtils; using Microsoft.ML.Internal.Utilities; +using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; -using Microsoft.ML.Transforms.FeatureSelection; using Newtonsoft.Json; using Newtonsoft.Json.Linq; @@ -97,7 +96,7 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s int? maxThreadsAllowed = Math.Min(args.parallel > 0 ? args.parallel.Value : penv->maxThreadsAllowed, penv->maxThreadsAllowed); maxThreadsAllowed = penv->maxThreadsAllowed > 0 ? 
maxThreadsAllowed : args.parallel; - var host = env.Register("RunGraph", args.randomSeed, null, maxThreadsAllowed); + var host = env.Register("RunGraph", args.randomSeed, null); JObject graph; try @@ -146,7 +145,7 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s { var extension = Path.GetExtension(path); if (extension == ".txt") - dv = TextLoader.ReadFile(host, new TextLoader.Arguments(), new MultiFileSource(path)); + dv = TextLoader.LoadFile(host, new TextLoader.Options(), new MultiFileSource(path)); else dv = new BinaryLoader(host, new BinaryLoader.Arguments(), path); @@ -285,7 +284,7 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s private static Dictionary ProcessColumns(ref IDataView view, int maxSlots, IHostEnvironment env) { Dictionary result = null; - List drop = null; + List drop = null; for (int i = 0; i < view.Schema.Count; i++) { if (view.Schema[i].IsHidden) @@ -293,24 +292,24 @@ private static Dictionary ProcessColumns(ref IDataVi var columnName = view.Schema[i].Name; var columnType = view.Schema[i].Type; - if (columnType.IsKnownSizeVector) + if (columnType.IsKnownSizeVector()) { Utils.Add(ref result, columnName, new ColumnMetadataInfo(true, null, null)); - if (maxSlots > 0 && columnType.ValueCount > maxSlots) + if (maxSlots > 0 && columnType.GetValueCount() > maxSlots) { Utils.Add(ref drop, - new SlotsDroppingTransformer.ColumnInfo( - input: columnName, + new SlotsDroppingTransformer.ColumnOptions( + name: columnName, slots: (maxSlots, null))); } } - else if (columnType.IsKey) + else if (columnType is KeyDataViewType) { Dictionary> map = null; - if (columnType.KeyCount > 0 && view.Schema[i].HasKeyValues(columnType.KeyCount)) + if (columnType.GetKeyCount() > 0 && view.Schema[i].HasKeyValues()) { var keyNames = default(VBuffer>); - view.Schema[i].Metadata.GetValue(MetadataUtils.Kinds.KeyValues, ref keyNames); + view.Schema[i].Annotations.GetValue(AnnotationUtils.Kinds.KeyValues, ref keyNames); map = keyNames.Items().ToDictionary(kv => (uint)kv.Key, kv => kv.Value); } Utils.Add(ref result, columnName, new ColumnMetadataInfo(false, null, map)); diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index f68369d9..b9b3ae1a 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -11,14 +11,15 @@ - - - - - - - - + + + + + + + + + diff --git a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt index 19139fe9..787972a2 100644 --- a/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt +++ b/src/python/docs/docstrings/FactorizationMachineBinaryClassifier.txt @@ -32,28 +32,6 @@ :param label: see `Columns `_. - :param normalize: Specifies the type of automatic normalization used: - - * ``"Auto"``: if normalization is needed, it is performed - automatically. This is the default choice. - * ``"No"``: no normalization is performed. - * ``"Yes"``: normalization is performed. - * ``"Warn"``: if normalization is needed, a warning - message is displayed, but normalization is not performed. - - Normalization rescales disparate data ranges to a standard scale. - Feature - scaling insures the distances between data points are proportional - and - enables various optimization methods such as gradient descent to - converge - much faster. If normalization is performed, a ``MaxMin`` normalizer - is - used. 
It normalizes values in an interval [a, b] where ``-1 <= a <= - 0`` - and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves - sparsity by mapping zero to zero. - .. seealso:: :py:func:`LogisticRegressionClassifier `, diff --git a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt index f3a5f3b9..db2c74db 100644 --- a/src/python/docs/docstrings/FastLinearBinaryClassifier.txt +++ b/src/python/docs/docstrings/FastLinearBinaryClassifier.txt @@ -48,7 +48,7 @@ optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets - ``shuffle`` to ``False`` and ``train_threads`` to ``1``. + ``shuffle`` to ``False`` and ``number_of_threads`` to ``1``. **Reference** diff --git a/src/python/docs/docstrings/FastLinearClassifier.txt b/src/python/docs/docstrings/FastLinearClassifier.txt index 6c741d22..2fcb2868 100644 --- a/src/python/docs/docstrings/FastLinearClassifier.txt +++ b/src/python/docs/docstrings/FastLinearClassifier.txt @@ -46,7 +46,7 @@ optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** diff --git a/src/python/docs/docstrings/FastLinearRegressor.txt b/src/python/docs/docstrings/FastLinearRegressor.txt index a80eb8bc..4dda71be 100644 --- a/src/python/docs/docstrings/FastLinearRegressor.txt +++ b/src/python/docs/docstrings/FastLinearRegressor.txt @@ -46,7 +46,7 @@ optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** diff --git a/src/python/docs/docstrings/OneHotHashVectorizer.txt b/src/python/docs/docstrings/OneHotHashVectorizer.txt index 96cea74e..40e92f4c 100644 --- a/src/python/docs/docstrings/OneHotHashVectorizer.txt +++ b/src/python/docs/docstrings/OneHotHashVectorizer.txt @@ -33,7 +33,7 @@ For more details see `Columns `_. - :param hash_bits: An integer specifying the number of bits to hash into. + :param number_of_bits: An integer specifying the number of bits to hash into. Must be between 1 and 30, inclusive. The default value is 16. :param random_state: An integer specifying the hashing seed. The default @@ -43,7 +43,7 @@ :param ordered: ``True`` to include the position of each term in the hash. Otherwise, ``False``. The default value is ``True``. - :param invert_hash: An integer specifying the limit on the number of keys + :param maximum_number_of_inverts: An integer specifying the limit on the number of keys that can be used to generate the slot name. ``0`` means no invert hashing; ``-1`` means no limit. While a zero value gives better performance, a non-zero value is needed to get meaningful coefficent diff --git a/src/python/docs/docstrings/PixelExtractor.txt b/src/python/docs/docstrings/PixelExtractor.txt index 64c7d202..55a1b18e 100644 --- a/src/python/docs/docstrings/PixelExtractor.txt +++ b/src/python/docs/docstrings/PixelExtractor.txt @@ -41,7 +41,7 @@ :param use_blue: Specifies whether to use blue channel. The default value is ``True``. - :param interleave_argb: Whether to separate each channel or + :param interleave: Whether to separate each channel or interleave in ARGB order. 
This might be important, for example, if you are training a convolutional neural network, since this would affect the shape of diff --git a/src/python/docs/docstrings/SsweEmbedding.txt b/src/python/docs/docstrings/SsweEmbedding.txt index 55897d9d..4c476285 100644 --- a/src/python/docs/docstrings/SsweEmbedding.txt +++ b/src/python/docs/docstrings/SsweEmbedding.txt @@ -44,10 +44,9 @@ <'This', 'is', 'good'>, users need to create an input column by: * concatenating columns with TX type, - * or using the ``output_tokens=True`` for ```NGramFeaturizer()`` to + * or using the ``output_tokens_column_name`` for ```NGramFeaturizer()`` to convert a column with sentences like "This is good" into <'This', - 'is', 'good'>. The column for the output token column is renamed with - a prefix of '_TranformedText'. + 'is', 'good'>. In the following example, after the ``NGramFeaturizer``, features named *ngram.__* are generated. diff --git a/src/python/docs/docstrings/WordEmbedding.txt b/src/python/docs/docstrings/WordEmbedding.txt index 3ba1ffe8..41d6f1c6 100644 --- a/src/python/docs/docstrings/WordEmbedding.txt +++ b/src/python/docs/docstrings/WordEmbedding.txt @@ -17,7 +17,7 @@ Available options are: 'GloVe50D', 'GloVe100D', 'GloVe200D', 'GloVe300D', 'GloVeTwitter25D', 'GloVeTwitter50D', 'GloVeTwitter100D', 'GloVeTwitter200D', 'FastTextWikipedia300D', - 'Sswe'. + 'SentimentSpecificWordEmbedding'. :param columns: a dictionary of key-value pairs, where key is the output column name and value is the input column name. @@ -45,10 +45,9 @@ <'This', 'is', 'good'>, users need to create an input column by: * concatenating columns with TX type, - * or using the ``output_tokens=True`` for ``NGramFeaturizer()`` to + * or using the ``output_tokens_column_name`` for ``NGramFeaturizer()`` to convert a column with sentences like "This is good" into <'This', - 'is', 'good'>. The column for the output token column is renamed with - a prefix of '_TranformedText'. + 'is', 'good'>. In the following example, after the ``NGramFeaturizer``, features diff --git a/src/python/docs/sphinx/ci_script/update_all_toc_yml.py b/src/python/docs/sphinx/ci_script/update_all_toc_yml.py index 156d2a22..b5438650 100644 --- a/src/python/docs/sphinx/ci_script/update_all_toc_yml.py +++ b/src/python/docs/sphinx/ci_script/update_all_toc_yml.py @@ -382,10 +382,10 @@ line = line.replace( "[Column Roles for Trainers](roles.md#roles)", "[Column Roles for Trainers](roles.md#roles-and-learners)") - if "[VectorType Columns](types.md#vectortype)" in line: + if "[VectorDataViewType Columns](types.md#vectortype)" in line: line = line.replace( - "[VectorType Columns](types.md#vectortype)", - "[VectorType Columns](types.md#vectortype-columns)") + "[VectorDataViewType Columns](types.md#vectortype)", + "[VectorDataViewType Columns](types.md#vectortype-columns)") if "[Column Operations for Transforms](columns.md#l-pipeline-syntax)" in line: line = line.replace( "[Column Operations for Transforms](columns.md#l-pipeline-syntax)", diff --git a/src/python/docs/sphinx/concepts/columns.rst b/src/python/docs/sphinx/concepts/columns.rst index ca051494..ae549eb0 100644 --- a/src/python/docs/sphinx/concepts/columns.rst +++ b/src/python/docs/sphinx/concepts/columns.rst @@ -28,7 +28,7 @@ Transform All Columns By default, the ``OneHotVectorizer`` transform will process all columns, which in our example results in a the original column values being replaced by their one hot encodings. 
Note that the -output of ``OneHotVectorizer`` are :ref:`VectorType`, so the output +output of ``OneHotVectorizer`` are :ref:`VectorDataViewType`, so the output names below are the column names appended with the ``slot`` names, which in our example are data driven and generated dynamically from the input data. diff --git a/src/python/docs/sphinx/concepts/datasources.rst b/src/python/docs/sphinx/concepts/datasources.rst index c1fd099d..0a8b1986 100644 --- a/src/python/docs/sphinx/concepts/datasources.rst +++ b/src/python/docs/sphinx/concepts/datasources.rst @@ -126,7 +126,7 @@ are used inside a `sklearn.pipeline.Pipeline or when they are used individually. However, when used inside a :py:class:`nimbusml.Pipeline`, the outputs are often stored in -a more optimized :ref:`VectorType`, which minimizes data conversion to +a more optimized :ref:`VectorDataViewType`, which minimizes data conversion to dataframes. When several transforms are combined inside an :py:class:`nimbusml.Pipeline`, the intermediate transforms will store the data in the optimized format and only the last transform will return a ``pandas.DataFrame``. diff --git a/src/python/docs/sphinx/concepts/roles.rst b/src/python/docs/sphinx/concepts/roles.rst index c76330f4..9873b352 100644 --- a/src/python/docs/sphinx/concepts/roles.rst +++ b/src/python/docs/sphinx/concepts/roles.rst @@ -141,9 +141,9 @@ Below is an example of using GroupId at the trainer. exp = Pipeline([ OneHotVectorizer() << ['workclass', 'education'], ToKey() << 'group', - LightGbmRanker(min_data_per_leaf = 1) << {Role.Feature: ['workclass', 'education'], Role.Label:'y', Role.GroupId:'group'} - #Equivalent to LightGbmRanker(min_data_per_leaf = 1) << {'Feature': ['workclass', 'education'], 'Label':'y', 'GroupId':'group'} - #Equivalent to LightGbmRanker(min_data_per_leaf = 1, feature = ['workclass', 'education'], label = 'y', group_id = 'group') + LightGbmRanker(minimum_example_count_per_leaf = 1) << {Role.Feature: ['workclass', 'education'], Role.Label:'y', Role.GroupId:'group'} + #Equivalent to LightGbmRanker(minimum_example_count_per_leaf = 1) << {'Feature': ['workclass', 'education'], 'Label':'y', 'GroupId':'group'} + #Equivalent to LightGbmRanker(minimum_example_count_per_leaf = 1, feature = ['workclass', 'education'], label = 'y', group_id = 'group') ]) exp.fit(df) prediction = exp.predict(df) \ No newline at end of file diff --git a/src/python/docs/sphinx/concepts/schema.rst b/src/python/docs/sphinx/concepts/schema.rst index 7c67a999..c7ee5f08 100644 --- a/src/python/docs/sphinx/concepts/schema.rst +++ b/src/python/docs/sphinx/concepts/schema.rst @@ -65,7 +65,7 @@ where * **col=** is specified for every column in the dataset, * **name** is the name of the column, * **position** is the 0-based index (or index range) of the column(s), -* **type** is one of the :ref:`column-types`. When the *position* is a range (i.e. *start_index-end_index*), the column is of :ref:`VectorType`. +* **type** is one of the :ref:`column-types`. When the *position* is a range (i.e. *start_index-end_index*), the column is of :ref:`VectorDataViewType`. * **options** * **header=** [+-] : Specifies if there is a header present in the text file diff --git a/src/python/docs/sphinx/concepts/types.rst b/src/python/docs/sphinx/concepts/types.rst index 32fadb86..21797155 100644 --- a/src/python/docs/sphinx/concepts/types.rst +++ b/src/python/docs/sphinx/concepts/types.rst @@ -28,35 +28,35 @@ labels to be of a numeric type. 
* **I1, I2, I4, I8** : signed integer types with the indicated number of bytes * **U1, U2, U4, U8, U256** : unsigned integer types with the indicated number of bytes * **U4[100-199]** : A key type based on U4 representing legal values from 100 to 199, inclusive -* **V** A :ref:`VectorType` with item type R4 and dimensionality information [3,2] +* **V** A :ref:`VectorDataViewType` with item type R4 and dimensionality information [3,2] For more details, please refer to `UnmanagedType Enumeration `_. .. _VectorType: -VectorType Columns +VectorDataViewType Columns """""""""""""""""" -A VectorType column contains a vector of values of a homogenous type, and is associated with a +A VectorDataViewType column contains a vector of values of a homogenous type, and is associated with a ``column_name``. The following table shows how NimbusML processes a dataset: .. image:: ../_static/images/table_car.png -The third column is a VectorType column named *Features* with 10 ``slots``. A VectorType column can +The third column is a VectorDataViewType column named *Features* with 10 ``slots``. A VectorDataViewType column can be referenced within a transform (or estimator) by its ``column_name``, such as using *Feature*. But the ``slots`` themselves may also have names which are generated dynamically by the transform during the ``fit()`` method. As the return type of all of the transforms is a ``pandas.DataFrame``, a -VectorType column will be converted. The ``column_name`` of the vector is lost, but the slot names +VectorDataViewType column will be converted. The ``column_name`` of the vector is lost, but the slot names are preserved (and available for viewing). In the above example, the *Features* column may be converted to 10 columns with names *Features.0*, *Features.1*,...,*Features.9* as the output of a transform. However, within a :py:class:`nimbusml.Pipeline` , there is no conversion to a -dataframe and therefore the column_name can still be used to refer to the VectorType column. +dataframe and therefore the column_name can still be used to refer to the VectorDataViewType column. .. note:: - Transforms frequently output VectorType columns. Within an + Transforms frequently output VectorDataViewType columns. Within an :py:class:`nimbusml.Pipeline`, data transfer between transforms is done very efficiently without any conversion to a dataframe. Since the ``column_name`` of the vector is also preserved, it is possible to refer to it by downstream transforms by name. However, when diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 910b76ea..23bcd324 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -117,6 +117,7 @@ + @@ -250,7 +251,7 @@ - + @@ -1095,7 +1096,7 @@ - + \ No newline at end of file diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 931aa288..aa21ec31 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '0.7.0' +__version__ = '1.0.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. diff --git a/src/python/nimbusml/cluster/kmeansplusplus.py b/src/python/nimbusml/cluster/kmeansplusplus.py index 47b6c5a3..a6cd94ff 100644 --- a/src/python/nimbusml/cluster/kmeansplusplus.py +++ b/src/python/nimbusml/cluster/kmeansplusplus.py @@ -66,19 +66,19 @@ class KMeansPlusPlus(core, BasePredictor, ClusterMixin): and ``0 <= b <= 1`` and ``b - a = 1``. 
This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param n_clusters: The number of clusters. - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. - :param init_algorithm: Cluster initialization algorithm. + :param initialization_algorithm: Cluster initialization algorithm. :param opt_tol: Tolerance parameter for trainer convergence. Low = slower, more accurate. - :param max_iterations: Maximum number of iterations. + :param maximum_number_of_iterations: Maximum number of iterations. :param accel_mem_budget_mb: Memory budget (in MBs) to use for KMeans acceleration. @@ -104,35 +104,35 @@ def __init__( normalize='Auto', caching='Auto', n_clusters=5, - train_threads=None, - init_algorithm='KMeansParallel', + number_of_threads=None, + initialization_algorithm='KMeansYinyang', opt_tol=1e-07, - max_iterations=1000, + maximum_number_of_iterations=1000, accel_mem_budget_mb=4096, feature=None, weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'weight_column' in params: + params['feature_column_name'] = feature + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='clusterer', **params) core.__init__( self, normalize=normalize, caching=caching, n_clusters=n_clusters, - train_threads=train_threads, - init_algorithm=init_algorithm, + number_of_threads=number_of_threads, + initialization_algorithm=initialization_algorithm, opt_tol=opt_tol, - max_iterations=max_iterations, + maximum_number_of_iterations=maximum_number_of_iterations, accel_mem_budget_mb=accel_mem_budget_mb, **params) self.feature = feature diff --git a/src/python/nimbusml/datasets/data/gplv2/infert.csv b/src/python/nimbusml/datasets/data/gplv2/infert.csv index 59720748..5fd8d4fb 100644 --- a/src/python/nimbusml/datasets/data/gplv2/infert.csv +++ b/src/python/nimbusml/datasets/data/gplv2/infert.csv @@ -1,249 +1,249 @@ "row_num","education","age","parity","induced","case","spontaneous","stratum","pooled.stratum" -"1","0-5yrs",26,6,1,1,2,1,3 -"2","0-5yrs",42,1,1,1,0,2,1 -"3","0-5yrs",39,6,2,1,0,3,4 -"4","0-5yrs",34,4,2,1,0,4,2 -"5","6-11yrs",35,3,1,1,1,5,32 -"6","6-11yrs",36,4,2,1,1,6,36 -"7","6-11yrs",23,1,0,1,0,7,6 -"8","6-11yrs",32,2,0,1,0,8,22 -"9","6-11yrs",21,1,0,1,1,9,5 -"10","6-11yrs",28,2,0,1,0,10,19 -"11","6-11yrs",29,2,1,1,0,11,20 -"12","6-11yrs",37,4,2,1,1,12,37 -"13","6-11yrs",31,1,1,1,0,13,9 -"14","6-11yrs",29,3,2,1,0,14,29 -"15","6-11yrs",31,2,1,1,1,15,21 -"16","6-11yrs",27,2,2,1,0,16,18 -"17","6-11yrs",30,5,2,1,1,17,38 -"18","6-11yrs",26,1,0,1,1,18,7 -"19","6-11yrs",25,3,2,1,1,19,28 -"20","6-11yrs",44,1,0,1,1,20,17 -"21","6-11yrs",40,1,0,1,1,21,14 -"22","6-11yrs",35,2,2,1,0,22,24 -"23","6-11yrs",28,2,0,1,2,23,19 -"24","6-11yrs",36,1,0,1,1,24,12 -"25","6-11yrs",27,2,1,1,1,25,18 -"26","6-11yrs",40,2,0,1,2,26,27 -"27","6-11yrs",38,2,0,1,2,27,26 -"28","6-11yrs",34,3,0,1,2,28,31 
-"29","6-11yrs",28,4,1,1,2,29,34 -"30","6-11yrs",30,4,2,1,0,30,35 -"31","6-11yrs",32,1,0,1,1,31,10 -"32","6-11yrs",34,2,1,1,0,32,23 -"33","6-11yrs",42,1,1,1,0,33,16 -"34","6-11yrs",32,2,0,1,2,34,22 -"35","6-11yrs",39,1,1,1,0,35,13 -"36","6-11yrs",35,2,0,1,2,36,24 -"37","6-11yrs",36,1,0,1,1,37,12 -"38","6-11yrs",34,3,1,1,2,38,31 -"39","6-11yrs",30,3,0,1,0,39,30 -"40","6-11yrs",28,1,0,1,1,40,8 -"41","6-11yrs",39,3,0,1,2,41,33 -"42","6-11yrs",35,1,0,1,0,42,11 -"43","6-11yrs",41,1,0,1,0,43,15 -"44","6-11yrs",37,2,1,1,1,44,25 -"45","12+ yrs",30,1,0,1,0,45,44 -"46","12+ yrs",37,1,1,1,0,46,48 -"47","12+ yrs",28,2,0,1,2,47,51 -"48","12+ yrs",27,4,2,1,0,48,61 -"49","12+ yrs",26,2,2,1,0,49,49 -"50","12+ yrs",38,3,0,1,2,50,60 -"51","12+ yrs",24,3,1,1,2,51,56 -"52","12+ yrs",36,5,1,1,2,52,62 -"53","12+ yrs",27,3,1,1,1,53,57 -"54","12+ yrs",28,1,0,1,1,54,42 -"55","12+ yrs",29,2,0,1,2,55,52 -"56","12+ yrs",36,2,0,1,2,56,55 -"57","12+ yrs",28,2,1,1,0,57,51 -"58","12+ yrs",28,2,0,1,2,58,51 -"59","12+ yrs",28,1,0,1,1,59,42 -"60","12+ yrs",27,2,0,1,2,60,50 -"61","12+ yrs",35,2,0,1,2,61,54 -"62","12+ yrs",25,1,0,1,1,62,41 -"63","12+ yrs",34,1,0,1,1,63,47 -"64","12+ yrs",31,2,0,1,2,64,53 -"65","12+ yrs",26,2,1,1,0,65,49 -"66","12+ yrs",32,1,0,1,1,66,46 -"67","12+ yrs",21,1,0,1,1,67,39 -"68","12+ yrs",28,3,1,1,2,68,58 -"69","12+ yrs",37,3,0,1,2,69,59 -"70","12+ yrs",25,1,1,1,0,70,41 -"71","12+ yrs",32,1,1,1,0,71,46 -"72","12+ yrs",25,1,0,1,1,72,41 -"73","12+ yrs",31,1,0,1,1,73,45 -"74","12+ yrs",38,6,0,1,2,74,63 -"75","12+ yrs",26,2,0,1,2,75,49 -"76","12+ yrs",31,1,0,1,1,76,45 -"77","12+ yrs",31,2,0,1,1,77,53 -"78","12+ yrs",25,1,1,1,0,78,41 -"79","12+ yrs",31,1,0,1,1,79,45 -"80","12+ yrs",34,1,0,1,1,80,47 -"81","12+ yrs",35,2,2,1,0,81,54 -"82","12+ yrs",29,1,0,1,1,82,43 -"83","12+ yrs",23,1,0,1,1,83,40 -"84","0-5yrs",26,6,2,0,0,1,3 -"85","0-5yrs",42,1,0,0,0,2,1 -"86","0-5yrs",39,6,2,0,0,3,4 -"87","0-5yrs",34,4,0,0,1,4,2 -"88","6-11yrs",35,3,2,0,0,5,32 -"89","6-11yrs",36,4,1,0,1,6,36 -"90","6-11yrs",23,1,0,0,0,7,6 -"91","6-11yrs",32,2,2,0,0,8,22 -"92","6-11yrs",21,1,0,0,1,9,5 -"93","6-11yrs",28,2,0,0,1,10,19 -"94","6-11yrs",29,2,0,0,0,11,20 -"95","6-11yrs",37,4,1,0,1,12,37 -"96","6-11yrs",31,1,0,0,0,13,9 -"97","6-11yrs",29,3,0,0,1,14,29 -"98","6-11yrs",31,2,1,0,0,15,21 -"99","6-11yrs",27,2,1,0,0,16,18 -"100","6-11yrs",30,5,0,0,2,17,38 -"101","6-11yrs",26,1,0,0,0,18,7 -"102","6-11yrs",25,3,0,0,1,19,28 -"103","6-11yrs",44,1,0,0,0,20,17 -"104","6-11yrs",40,1,0,0,0,21,14 -"105","6-11yrs",35,2,0,0,0,22,24 -"106","6-11yrs",28,2,0,0,0,23,19 -"107","6-11yrs",36,1,0,0,0,24,12 -"108","6-11yrs",27,2,0,0,1,25,18 -"109","6-11yrs",40,2,0,0,0,26,27 -"110","6-11yrs",38,2,0,0,0,27,26 -"111","6-11yrs",34,3,0,0,0,28,31 -"112","6-11yrs",28,4,0,0,2,29,34 -"113","6-11yrs",30,4,1,0,1,30,35 -"114","6-11yrs",32,1,0,0,0,31,10 -"115","6-11yrs",34,2,1,0,0,32,23 -"116","6-11yrs",42,1,1,0,0,33,16 -"117","6-11yrs",32,2,0,0,0,34,22 -"118","6-11yrs",39,1,0,0,0,35,13 -"119","6-11yrs",35,2,0,0,0,36,24 -"120","6-11yrs",36,1,0,0,0,37,12 -"121","6-11yrs",34,3,2,0,0,38,31 -"122","6-11yrs",30,3,0,0,2,39,30 -"123","6-11yrs",28,1,1,0,0,40,8 -"124","6-11yrs",39,3,1,0,0,41,33 -"125","6-11yrs",35,1,0,0,0,42,11 -"126","6-11yrs",41,1,0,0,0,43,15 -"127","6-11yrs",37,2,0,0,0,44,25 -"128","12+ yrs",30,1,1,0,0,45,44 -"129","12+ yrs",37,1,0,0,0,46,48 -"130","12+ yrs",28,2,1,0,0,47,51 -"131","12+ yrs",27,4,2,0,1,48,61 -"132","12+ yrs",26,2,1,0,0,49,49 -"133","12+ yrs",38,3,1,0,0,50,60 -"134","12+ yrs",24,3,2,0,1,51,56 -"135","12+ yrs",36,5,1,0,1,52,62 
-"136","12+ yrs",27,3,1,0,1,53,57 -"137","12+ yrs",28,1,1,0,0,54,42 -"138","12+ yrs",29,2,1,0,0,55,52 -"139","12+ yrs",36,2,1,0,0,56,55 -"140","12+ yrs",28,2,1,0,1,57,51 -"141","12+ yrs",28,2,2,0,0,58,51 -"142","12+ yrs",28,1,1,0,0,59,42 -"143","12+ yrs",27,2,1,0,0,60,50 -"144","12+ yrs",35,2,2,0,0,61,54 -"145","12+ yrs",25,1,1,0,0,62,41 -"146","12+ yrs",34,1,0,0,0,63,47 -"147","12+ yrs",31,2,0,0,0,64,53 -"148","12+ yrs",26,2,0,0,1,65,49 -"149","12+ yrs",32,1,0,0,0,66,46 -"150","12+ yrs",21,1,0,0,1,67,39 -"151","12+ yrs",28,3,2,0,0,68,58 -"152","12+ yrs",37,3,1,0,1,69,59 -"153","12+ yrs",25,1,0,0,0,70,41 -"154","12+ yrs",32,1,1,0,0,71,46 -"155","12+ yrs",25,1,0,0,0,72,41 -"156","12+ yrs",31,1,0,0,1,73,45 -"157","12+ yrs",26,2,0,0,2,75,49 -"158","12+ yrs",31,1,0,0,0,76,45 -"159","12+ yrs",31,2,2,0,0,77,53 -"160","12+ yrs",25,1,0,0,0,78,41 -"161","12+ yrs",31,1,0,0,0,79,45 -"162","12+ yrs",34,1,0,0,0,80,47 -"163","12+ yrs",35,2,0,0,0,81,54 -"164","12+ yrs",29,1,0,0,1,82,43 -"165","12+ yrs",23,1,0,0,1,83,40 -"166","0-5yrs",26,6,2,0,0,1,3 -"167","0-5yrs",42,1,0,0,0,2,1 -"168","0-5yrs",39,6,2,0,0,3,4 -"169","0-5yrs",34,4,0,0,2,4,2 -"170","6-11yrs",35,3,0,0,0,5,32 -"171","6-11yrs",36,4,0,0,2,6,36 -"172","6-11yrs",23,1,0,0,0,7,6 -"173","6-11yrs",32,2,0,0,1,8,22 -"174","6-11yrs",21,1,1,0,0,9,5 -"175","6-11yrs",28,2,0,0,1,10,19 -"176","6-11yrs",29,2,0,0,1,11,20 -"177","6-11yrs",37,4,0,0,1,12,37 -"178","6-11yrs",31,1,0,0,0,13,9 -"179","6-11yrs",29,3,0,0,2,14,29 -"180","6-11yrs",31,2,1,0,0,15,21 -"181","6-11yrs",27,2,0,0,0,16,18 -"182","6-11yrs",30,5,1,0,2,17,38 -"183","6-11yrs",26,1,1,0,0,18,7 -"184","6-11yrs",25,3,1,0,1,19,28 -"185","6-11yrs",44,1,1,0,0,20,17 -"186","6-11yrs",40,1,0,0,0,21,14 -"187","6-11yrs",35,2,0,0,0,22,24 -"188","6-11yrs",28,2,2,0,0,23,19 -"189","6-11yrs",36,1,0,0,1,24,12 -"190","6-11yrs",27,2,0,0,2,25,18 -"191","6-11yrs",40,2,0,0,0,26,27 -"192","6-11yrs",38,2,0,0,0,27,26 -"193","6-11yrs",34,3,0,0,0,28,31 -"194","6-11yrs",28,4,2,0,1,29,34 -"195","6-11yrs",30,4,1,0,1,30,35 -"196","6-11yrs",32,1,0,0,0,31,10 -"197","6-11yrs",34,2,0,0,0,32,23 -"198","6-11yrs",42,1,0,0,0,33,16 -"199","6-11yrs",32,2,2,0,0,34,22 -"200","6-11yrs",39,1,0,0,0,35,13 -"201","6-11yrs",35,2,0,0,0,36,24 -"202","6-11yrs",36,1,0,0,0,37,12 -"203","6-11yrs",34,3,2,0,0,38,31 -"204","6-11yrs",30,3,0,0,1,39,30 -"205","6-11yrs",28,1,0,0,0,40,8 -"206","6-11yrs",39,3,0,0,0,41,33 -"207","6-11yrs",35,1,0,0,0,42,11 -"208","6-11yrs",41,1,0,0,0,43,15 -"209","6-11yrs",37,2,0,0,0,44,25 -"210","12+ yrs",30,1,0,0,0,45,44 -"211","12+ yrs",37,1,0,0,1,46,48 -"212","12+ yrs",28,2,1,0,0,47,51 -"213","12+ yrs",27,4,2,0,0,48,61 -"214","12+ yrs",26,2,1,0,0,49,49 -"215","12+ yrs",38,3,1,0,0,50,60 -"216","12+ yrs",24,3,2,0,0,51,56 -"217","12+ yrs",36,5,2,0,1,52,62 -"218","12+ yrs",27,3,2,0,0,53,57 -"219","12+ yrs",28,1,0,0,1,54,42 -"220","12+ yrs",29,2,1,0,1,55,52 -"221","12+ yrs",36,2,0,0,1,56,55 -"222","12+ yrs",28,2,2,0,0,57,51 -"223","12+ yrs",28,2,1,0,0,58,51 -"224","12+ yrs",28,1,0,0,0,59,42 -"225","12+ yrs",27,2,1,0,0,60,50 -"226","12+ yrs",35,2,1,0,0,61,54 -"227","12+ yrs",25,1,1,0,0,62,41 -"228","12+ yrs",34,1,0,0,0,63,47 -"229","12+ yrs",31,2,1,0,0,64,53 -"230","12+ yrs",26,2,0,0,2,65,49 -"231","12+ yrs",32,1,1,0,0,66,46 -"232","12+ yrs",21,1,0,0,0,67,39 -"233","12+ yrs",28,3,2,0,0,68,58 -"234","12+ yrs",37,3,0,0,2,69,59 -"235","12+ yrs",25,1,1,0,0,70,41 -"236","12+ yrs",32,1,0,0,0,71,46 -"237","12+ yrs",25,1,1,0,0,72,41 -"238","12+ yrs",31,1,0,0,0,73,45 -"239","12+ yrs",38,6,0,0,2,74,63 -"240","12+ yrs",26,2,1,0,1,75,49 
-"241","12+ yrs",31,1,1,0,0,76,45 -"242","12+ yrs",31,2,0,0,1,77,53 -"243","12+ yrs",25,1,0,0,1,78,41 -"244","12+ yrs",31,1,0,0,1,79,45 -"245","12+ yrs",34,1,0,0,0,80,47 -"246","12+ yrs",35,2,2,0,0,81,54 -"247","12+ yrs",29,1,0,0,1,82,43 -"248","12+ yrs",23,1,0,0,1,83,40 +1,"0-5yrs",26,6,1,1,2,1,3 +2,"0-5yrs",42,1,1,1,0,2,1 +3,"0-5yrs",39,6,2,1,0,3,4 +4,"0-5yrs",34,4,2,1,0,4,2 +5,"6-11yrs",35,3,1,1,1,5,32 +6,"6-11yrs",36,4,2,1,1,6,36 +7,"6-11yrs",23,1,0,1,0,7,6 +8,"6-11yrs",32,2,0,1,0,8,22 +9,"6-11yrs",21,1,0,1,1,9,5 +10,"6-11yrs",28,2,0,1,0,10,19 +11,"6-11yrs",29,2,1,1,0,11,20 +12,"6-11yrs",37,4,2,1,1,12,37 +13,"6-11yrs",31,1,1,1,0,13,9 +14,"6-11yrs",29,3,2,1,0,14,29 +15,"6-11yrs",31,2,1,1,1,15,21 +16,"6-11yrs",27,2,2,1,0,16,18 +17,"6-11yrs",30,5,2,1,1,17,38 +18,"6-11yrs",26,1,0,1,1,18,7 +19,"6-11yrs",25,3,2,1,1,19,28 +20,"6-11yrs",44,1,0,1,1,20,17 +21,"6-11yrs",40,1,0,1,1,21,14 +22,"6-11yrs",35,2,2,1,0,22,24 +23,"6-11yrs",28,2,0,1,2,23,19 +24,"6-11yrs",36,1,0,1,1,24,12 +25,"6-11yrs",27,2,1,1,1,25,18 +26,"6-11yrs",40,2,0,1,2,26,27 +27,"6-11yrs",38,2,0,1,2,27,26 +28,"6-11yrs",34,3,0,1,2,28,31 +29,"6-11yrs",28,4,1,1,2,29,34 +30,"6-11yrs",30,4,2,1,0,30,35 +31,"6-11yrs",32,1,0,1,1,31,10 +32,"6-11yrs",34,2,1,1,0,32,23 +33,"6-11yrs",42,1,1,1,0,33,16 +34,"6-11yrs",32,2,0,1,2,34,22 +35,"6-11yrs",39,1,1,1,0,35,13 +36,"6-11yrs",35,2,0,1,2,36,24 +37,"6-11yrs",36,1,0,1,1,37,12 +38,"6-11yrs",34,3,1,1,2,38,31 +39,"6-11yrs",30,3,0,1,0,39,30 +40,"6-11yrs",28,1,0,1,1,40,8 +41,"6-11yrs",39,3,0,1,2,41,33 +42,"6-11yrs",35,1,0,1,0,42,11 +43,"6-11yrs",41,1,0,1,0,43,15 +44,"6-11yrs",37,2,1,1,1,44,25 +45,"12+ yrs",30,1,0,1,0,45,44 +46,"12+ yrs",37,1,1,1,0,46,48 +47,"12+ yrs",28,2,0,1,2,47,51 +48,"12+ yrs",27,4,2,1,0,48,61 +49,"12+ yrs",26,2,2,1,0,49,49 +50,"12+ yrs",38,3,0,1,2,50,60 +51,"12+ yrs",24,3,1,1,2,51,56 +52,"12+ yrs",36,5,1,1,2,52,62 +53,"12+ yrs",27,3,1,1,1,53,57 +54,"12+ yrs",28,1,0,1,1,54,42 +55,"12+ yrs",29,2,0,1,2,55,52 +56,"12+ yrs",36,2,0,1,2,56,55 +57,"12+ yrs",28,2,1,1,0,57,51 +58,"12+ yrs",28,2,0,1,2,58,51 +59,"12+ yrs",28,1,0,1,1,59,42 +60,"12+ yrs",27,2,0,1,2,60,50 +61,"12+ yrs",35,2,0,1,2,61,54 +62,"12+ yrs",25,1,0,1,1,62,41 +63,"12+ yrs",34,1,0,1,1,63,47 +64,"12+ yrs",31,2,0,1,2,64,53 +65,"12+ yrs",26,2,1,1,0,65,49 +66,"12+ yrs",32,1,0,1,1,66,46 +67,"12+ yrs",21,1,0,1,1,67,39 +68,"12+ yrs",28,3,1,1,2,68,58 +69,"12+ yrs",37,3,0,1,2,69,59 +70,"12+ yrs",25,1,1,1,0,70,41 +71,"12+ yrs",32,1,1,1,0,71,46 +72,"12+ yrs",25,1,0,1,1,72,41 +73,"12+ yrs",31,1,0,1,1,73,45 +74,"12+ yrs",38,6,0,1,2,74,63 +75,"12+ yrs",26,2,0,1,2,75,49 +76,"12+ yrs",31,1,0,1,1,76,45 +77,"12+ yrs",31,2,0,1,1,77,53 +78,"12+ yrs",25,1,1,1,0,78,41 +79,"12+ yrs",31,1,0,1,1,79,45 +80,"12+ yrs",34,1,0,1,1,80,47 +81,"12+ yrs",35,2,2,1,0,81,54 +82,"12+ yrs",29,1,0,1,1,82,43 +83,"12+ yrs",23,1,0,1,1,83,40 +84,"0-5yrs",26,6,2,0,0,1,3 +85,"0-5yrs",42,1,0,0,0,2,1 +86,"0-5yrs",39,6,2,0,0,3,4 +87,"0-5yrs",34,4,0,0,1,4,2 +88,"6-11yrs",35,3,2,0,0,5,32 +89,"6-11yrs",36,4,1,0,1,6,36 +90,"6-11yrs",23,1,0,0,0,7,6 +91,"6-11yrs",32,2,2,0,0,8,22 +92,"6-11yrs",21,1,0,0,1,9,5 +93,"6-11yrs",28,2,0,0,1,10,19 +94,"6-11yrs",29,2,0,0,0,11,20 +95,"6-11yrs",37,4,1,0,1,12,37 +96,"6-11yrs",31,1,0,0,0,13,9 +97,"6-11yrs",29,3,0,0,1,14,29 +98,"6-11yrs",31,2,1,0,0,15,21 +99,"6-11yrs",27,2,1,0,0,16,18 +100,"6-11yrs",30,5,0,0,2,17,38 +101,"6-11yrs",26,1,0,0,0,18,7 +102,"6-11yrs",25,3,0,0,1,19,28 +103,"6-11yrs",44,1,0,0,0,20,17 +104,"6-11yrs",40,1,0,0,0,21,14 +105,"6-11yrs",35,2,0,0,0,22,24 +106,"6-11yrs",28,2,0,0,0,23,19 +107,"6-11yrs",36,1,0,0,0,24,12 
+108,"6-11yrs",27,2,0,0,1,25,18 +109,"6-11yrs",40,2,0,0,0,26,27 +110,"6-11yrs",38,2,0,0,0,27,26 +111,"6-11yrs",34,3,0,0,0,28,31 +112,"6-11yrs",28,4,0,0,2,29,34 +113,"6-11yrs",30,4,1,0,1,30,35 +114,"6-11yrs",32,1,0,0,0,31,10 +115,"6-11yrs",34,2,1,0,0,32,23 +116,"6-11yrs",42,1,1,0,0,33,16 +117,"6-11yrs",32,2,0,0,0,34,22 +118,"6-11yrs",39,1,0,0,0,35,13 +119,"6-11yrs",35,2,0,0,0,36,24 +120,"6-11yrs",36,1,0,0,0,37,12 +121,"6-11yrs",34,3,2,0,0,38,31 +122,"6-11yrs",30,3,0,0,2,39,30 +123,"6-11yrs",28,1,1,0,0,40,8 +124,"6-11yrs",39,3,1,0,0,41,33 +125,"6-11yrs",35,1,0,0,0,42,11 +126,"6-11yrs",41,1,0,0,0,43,15 +127,"6-11yrs",37,2,0,0,0,44,25 +128,"12+ yrs",30,1,1,0,0,45,44 +129,"12+ yrs",37,1,0,0,0,46,48 +130,"12+ yrs",28,2,1,0,0,47,51 +131,"12+ yrs",27,4,2,0,1,48,61 +132,"12+ yrs",26,2,1,0,0,49,49 +133,"12+ yrs",38,3,1,0,0,50,60 +134,"12+ yrs",24,3,2,0,1,51,56 +135,"12+ yrs",36,5,1,0,1,52,62 +136,"12+ yrs",27,3,1,0,1,53,57 +137,"12+ yrs",28,1,1,0,0,54,42 +138,"12+ yrs",29,2,1,0,0,55,52 +139,"12+ yrs",36,2,1,0,0,56,55 +140,"12+ yrs",28,2,1,0,1,57,51 +141,"12+ yrs",28,2,2,0,0,58,51 +142,"12+ yrs",28,1,1,0,0,59,42 +143,"12+ yrs",27,2,1,0,0,60,50 +144,"12+ yrs",35,2,2,0,0,61,54 +145,"12+ yrs",25,1,1,0,0,62,41 +146,"12+ yrs",34,1,0,0,0,63,47 +147,"12+ yrs",31,2,0,0,0,64,53 +148,"12+ yrs",26,2,0,0,1,65,49 +149,"12+ yrs",32,1,0,0,0,66,46 +150,"12+ yrs",21,1,0,0,1,67,39 +151,"12+ yrs",28,3,2,0,0,68,58 +152,"12+ yrs",37,3,1,0,1,69,59 +153,"12+ yrs",25,1,0,0,0,70,41 +154,"12+ yrs",32,1,1,0,0,71,46 +155,"12+ yrs",25,1,0,0,0,72,41 +156,"12+ yrs",31,1,0,0,1,73,45 +157,"12+ yrs",26,2,0,0,2,75,49 +158,"12+ yrs",31,1,0,0,0,76,45 +159,"12+ yrs",31,2,2,0,0,77,53 +160,"12+ yrs",25,1,0,0,0,78,41 +161,"12+ yrs",31,1,0,0,0,79,45 +162,"12+ yrs",34,1,0,0,0,80,47 +163,"12+ yrs",35,2,0,0,0,81,54 +164,"12+ yrs",29,1,0,0,1,82,43 +165,"12+ yrs",23,1,0,0,1,83,40 +166,"0-5yrs",26,6,2,0,0,1,3 +167,"0-5yrs",42,1,0,0,0,2,1 +168,"0-5yrs",39,6,2,0,0,3,4 +169,"0-5yrs",34,4,0,0,2,4,2 +170,"6-11yrs",35,3,0,0,0,5,32 +171,"6-11yrs",36,4,0,0,2,6,36 +172,"6-11yrs",23,1,0,0,0,7,6 +173,"6-11yrs",32,2,0,0,1,8,22 +174,"6-11yrs",21,1,1,0,0,9,5 +175,"6-11yrs",28,2,0,0,1,10,19 +176,"6-11yrs",29,2,0,0,1,11,20 +177,"6-11yrs",37,4,0,0,1,12,37 +178,"6-11yrs",31,1,0,0,0,13,9 +179,"6-11yrs",29,3,0,0,2,14,29 +180,"6-11yrs",31,2,1,0,0,15,21 +181,"6-11yrs",27,2,0,0,0,16,18 +182,"6-11yrs",30,5,1,0,2,17,38 +183,"6-11yrs",26,1,1,0,0,18,7 +184,"6-11yrs",25,3,1,0,1,19,28 +185,"6-11yrs",44,1,1,0,0,20,17 +186,"6-11yrs",40,1,0,0,0,21,14 +187,"6-11yrs",35,2,0,0,0,22,24 +188,"6-11yrs",28,2,2,0,0,23,19 +189,"6-11yrs",36,1,0,0,1,24,12 +190,"6-11yrs",27,2,0,0,2,25,18 +191,"6-11yrs",40,2,0,0,0,26,27 +192,"6-11yrs",38,2,0,0,0,27,26 +193,"6-11yrs",34,3,0,0,0,28,31 +194,"6-11yrs",28,4,2,0,1,29,34 +195,"6-11yrs",30,4,1,0,1,30,35 +196,"6-11yrs",32,1,0,0,0,31,10 +197,"6-11yrs",34,2,0,0,0,32,23 +198,"6-11yrs",42,1,0,0,0,33,16 +199,"6-11yrs",32,2,2,0,0,34,22 +200,"6-11yrs",39,1,0,0,0,35,13 +201,"6-11yrs",35,2,0,0,0,36,24 +202,"6-11yrs",36,1,0,0,0,37,12 +203,"6-11yrs",34,3,2,0,0,38,31 +204,"6-11yrs",30,3,0,0,1,39,30 +205,"6-11yrs",28,1,0,0,0,40,8 +206,"6-11yrs",39,3,0,0,0,41,33 +207,"6-11yrs",35,1,0,0,0,42,11 +208,"6-11yrs",41,1,0,0,0,43,15 +209,"6-11yrs",37,2,0,0,0,44,25 +210,"12+ yrs",30,1,0,0,0,45,44 +211,"12+ yrs",37,1,0,0,1,46,48 +212,"12+ yrs",28,2,1,0,0,47,51 +213,"12+ yrs",27,4,2,0,0,48,61 +214,"12+ yrs",26,2,1,0,0,49,49 +215,"12+ yrs",38,3,1,0,0,50,60 +216,"12+ yrs",24,3,2,0,0,51,56 +217,"12+ yrs",36,5,2,0,1,52,62 +218,"12+ yrs",27,3,2,0,0,53,57 +219,"12+ 
yrs",28,1,0,0,1,54,42 +220,"12+ yrs",29,2,1,0,1,55,52 +221,"12+ yrs",36,2,0,0,1,56,55 +222,"12+ yrs",28,2,2,0,0,57,51 +223,"12+ yrs",28,2,1,0,0,58,51 +224,"12+ yrs",28,1,0,0,0,59,42 +225,"12+ yrs",27,2,1,0,0,60,50 +226,"12+ yrs",35,2,1,0,0,61,54 +227,"12+ yrs",25,1,1,0,0,62,41 +228,"12+ yrs",34,1,0,0,0,63,47 +229,"12+ yrs",31,2,1,0,0,64,53 +230,"12+ yrs",26,2,0,0,2,65,49 +231,"12+ yrs",32,1,1,0,0,66,46 +232,"12+ yrs",21,1,0,0,0,67,39 +233,"12+ yrs",28,3,2,0,0,68,58 +234,"12+ yrs",37,3,0,0,2,69,59 +235,"12+ yrs",25,1,1,0,0,70,41 +236,"12+ yrs",32,1,0,0,0,71,46 +237,"12+ yrs",25,1,1,0,0,72,41 +238,"12+ yrs",31,1,0,0,0,73,45 +239,"12+ yrs",38,6,0,0,2,74,63 +240,"12+ yrs",26,2,1,0,1,75,49 +241,"12+ yrs",31,1,1,0,0,76,45 +242,"12+ yrs",31,2,0,0,1,77,53 +243,"12+ yrs",25,1,0,0,1,78,41 +244,"12+ yrs",31,1,0,0,1,79,45 +245,"12+ yrs",34,1,0,0,0,80,47 +246,"12+ yrs",35,2,2,0,0,81,54 +247,"12+ yrs",29,1,0,0,1,82,43 +248,"12+ yrs",23,1,0,0,1,83,40 diff --git a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py index 7382dd10..fd3d75a2 100644 --- a/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py +++ b/src/python/nimbusml/decomposition/factorizationmachinebinaryclassifier.py @@ -54,42 +54,32 @@ class FactorizationMachineBinaryClassifier( :param label: see `Columns `_. - :param learning_rate: Initial learning rate. + :param weight: see `Columns `_. - :param iters: Number of training iterations. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. - :param latent_dim: Latent space dimension. + :param number_of_iterations: Number of training iterations. + + :param latent_dimension: Latent space dimension. :param lambda_linear: Regularization coefficient of linear weights. :param lambda_latent: Regularization coefficient of latent weights. - :param normalize: Specifies the type of automatic normalization used: - - * ``"Auto"``: if normalization is needed, it is performed - automatically. This is the default choice. - * ``"No"``: no normalization is performed. - * ``"Yes"``: normalization is performed. - * ``"Warn"``: if normalization is needed, a warning - message is displayed, but normalization is not performed. - - Normalization rescales disparate data ranges to a standard scale. - Feature - scaling insures the distances between data points are proportional - and - enables various optimization methods such as gradient descent to - converge - much faster. If normalization is performed, a ``MaxMin`` normalizer - is - used. It normalizes values in an interval [a, b] where ``-1 <= a <= - 0`` - and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves - sparsity by mapping zero to zero. - - :param norm: Whether to normalize the input vectors so that the + :param normalize: Whether to normalize the input vectors so that the concatenation of all fields' feature vectors is unit-length. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. + + :param extra_feature_columns: Extra columns to use for feature vectors. 
The + i-th specified string denotes the column containing features form the + (i+1)-th field. Note that the first field is specified by "feat" + instead of "exfeat". :param shuffle: Whether to shuffle for each training iteration. @@ -119,47 +109,54 @@ class FactorizationMachineBinaryClassifier( def __init__( self, learning_rate=0.1, - iters=5, - latent_dim=20, + number_of_iterations=5, + latent_dimension=20, lambda_linear=0.0001, lambda_latent=0.0001, - normalize='Auto', - norm=True, + normalize=True, caching='Auto', + extra_feature_columns=None, shuffle=True, verbose=True, radius=0.5, feature=None, label=None, + weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label + if 'example_weight_column_name' in params: + raise NameError( + "'example_weight_column_name' must be renamed to 'weight'") + if weight: + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, learning_rate=learning_rate, - iters=iters, - latent_dim=latent_dim, + number_of_iterations=number_of_iterations, + latent_dimension=latent_dimension, lambda_linear=lambda_linear, lambda_latent=lambda_latent, normalize=normalize, - norm=norm, caching=caching, + extra_feature_columns=extra_feature_columns, shuffle=shuffle, verbose=verbose, radius=radius, **params) self.feature = feature self.label = label + self.weight = weight @trace def predict_proba(self, X, **params): diff --git a/src/python/nimbusml/decomposition/pcaanomalydetector.py b/src/python/nimbusml/decomposition/pcaanomalydetector.py index 57b21b90..bdf42b22 100644 --- a/src/python/nimbusml/decomposition/pcaanomalydetector.py +++ b/src/python/nimbusml/decomposition/pcaanomalydetector.py @@ -92,7 +92,7 @@ class PcaAnomalyDetector(core, BasePredictor, ClassifierMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param rank: The number of components in the PCA. 
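The hunks above move FactorizationMachineBinaryClassifier (and, in the same pattern, PcaAnomalyDetector) to the ML.NET 1.0 argument names: ``iters`` becomes ``number_of_iterations``, ``latent_dim`` becomes ``latent_dimension``, the boolean ``norm`` becomes ``normalize``, and the internal column roles become ``feature_column_name`` / ``label_column_name`` / ``example_weight_column_name``. A minimal sketch of calling code after the rename; the column names and values here are hypothetical and not taken from the repository:

    from nimbusml.decomposition import FactorizationMachineBinaryClassifier

    # Hypothetical feature/label columns; argument names follow the renamed signature.
    ftm = FactorizationMachineBinaryClassifier(
        number_of_iterations=10,   # was ``iters``
        latent_dimension=20,       # was ``latent_dim``
        normalize=True,            # was ``norm``; the old string-valued ``normalize`` is gone
        learning_rate=0.1,
        feature=['f1', 'f2'],
        label='y')

As before, passing the internal ML.NET name directly (now ``feature_column_name=...``) raises a NameError directing callers to the ``feature`` keyword.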
@@ -128,16 +128,16 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'weight_column' in params: + params['feature_column_name'] = feature + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='anomaly', **params) core.__init__( self, diff --git a/src/python/nimbusml/decomposition/pcatransformer.py b/src/python/nimbusml/decomposition/pcatransformer.py index 7ddb6326..5ef167e3 100644 --- a/src/python/nimbusml/decomposition/pcatransformer.py +++ b/src/python/nimbusml/decomposition/pcatransformer.py @@ -89,11 +89,11 @@ def __init__( columns=None, **params): - if 'weight_column' in params: + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight if columns: params['columns'] = columns BaseTransform.__init__(self, **params) diff --git a/src/python/nimbusml/ensemble/booster/dart.py b/src/python/nimbusml/ensemble/booster/dart.py index a4536a2e..33dc8295 100644 --- a/src/python/nimbusml/ensemble/booster/dart.py +++ b/src/python/nimbusml/ensemble/booster/dart.py @@ -35,53 +35,51 @@ class Dart(core): `_ - :param drop_rate: Drop ratio for trees. Range:(0,1). + :param tree_drop_fraction: The drop ratio for trees. Range:(0,1). - :param max_drop: Max number of dropped tree in a boosting round. + :param maximum_number_of_dropped_trees_per_round: Maximum number of dropped + trees in a boosting round. - :param skip_drop: Probability for not perform dropping in a boosting round. + :param skip_drop_fraction: Probability for not dropping in a boosting + round. :param xgboost_dart_mode: True will enable xgboost dart mode. :param uniform_drop: True will enable uniform drop. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. 
The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. + :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. 
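The Dart booster keeps its behavior, but its arguments take the longer ML.NET 1.0 names (``drop_rate`` to ``tree_drop_fraction``, ``max_drop`` to ``maximum_number_of_dropped_trees_per_round``, ``subsample`` to ``subsample_fraction``, ``reg_lambda``/``reg_alpha`` to ``l2_regularization``/``l1_regularization``), and ``unbalanced_sets`` and ``scale_pos_weight`` drop off the booster. A minimal sketch with hypothetical values, assuming the LightGBM trainers still accept a booster object as in earlier releases:

    from nimbusml.ensemble import LightGbmBinaryClassifier
    from nimbusml.ensemble.booster import Dart

    # Hypothetical hyperparameter values; names follow the renamed Dart signature.
    booster = Dart(
        tree_drop_fraction=0.2,                       # was drop_rate
        maximum_number_of_dropped_trees_per_round=2,  # was max_drop
        skip_drop_fraction=0.5,                       # was skip_drop
        maximum_tree_depth=6,                         # was max_depth
        subsample_fraction=0.8,                       # was subsample
        l2_regularization=0.01,                       # was reg_lambda
        l1_regularization=0.0)                        # was reg_alpha
    clf = LightGbmBinaryClassifier(booster=booster)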
@@ -104,39 +102,35 @@ class Dart(core): @trace def __init__( self, - drop_rate=0.1, - max_drop=1, - skip_drop=0.5, + tree_drop_fraction=0.1, + maximum_number_of_dropped_trees_per_round=1, + skip_drop_fraction=0.5, xgboost_dart_mode=False, uniform_drop=False, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): core.__init__( self, - drop_rate=drop_rate, - max_drop=max_drop, - skip_drop=skip_drop, + tree_drop_fraction=tree_drop_fraction, + maximum_number_of_dropped_trees_per_round=maximum_number_of_dropped_trees_per_round, + skip_drop_fraction=skip_drop_fraction, xgboost_dart_mode=xgboost_dart_mode, uniform_drop=uniform_drop, - unbalanced_sets=unbalanced_sets, - min_split_gain=min_split_gain, - max_depth=max_depth, - min_child_weight=min_child_weight, - subsample_freq=subsample_freq, - subsample=subsample, + minimum_split_gain=minimum_split_gain, + maximum_tree_depth=maximum_tree_depth, + minimum_child_weight=minimum_child_weight, + subsample_frequency=subsample_frequency, + subsample_fraction=subsample_fraction, feature_fraction=feature_fraction, - reg_lambda=reg_lambda, - reg_alpha=reg_alpha, - scale_pos_weight=scale_pos_weight, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, **params) def get_params(self, deep=False): diff --git a/src/python/nimbusml/ensemble/booster/gbdt.py b/src/python/nimbusml/ensemble/booster/gbdt.py index ba69c9e2..49427e18 100644 --- a/src/python/nimbusml/ensemble/booster/gbdt.py +++ b/src/python/nimbusml/ensemble/booster/gbdt.py @@ -19,43 +19,39 @@ class Gbdt(core): Traditional Gradient Boosting Decision Tree. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. + :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. 
Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. @@ -78,29 +74,25 @@ class Gbdt(core): @trace def __init__( self, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): core.__init__( self, - unbalanced_sets=unbalanced_sets, - min_split_gain=min_split_gain, - max_depth=max_depth, - min_child_weight=min_child_weight, - subsample_freq=subsample_freq, - subsample=subsample, + minimum_split_gain=minimum_split_gain, + maximum_tree_depth=maximum_tree_depth, + minimum_child_weight=minimum_child_weight, + subsample_frequency=subsample_frequency, + subsample_fraction=subsample_fraction, feature_fraction=feature_fraction, - reg_lambda=reg_lambda, - reg_alpha=reg_alpha, - scale_pos_weight=scale_pos_weight, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, **params) def get_params(self, deep=False): diff --git a/src/python/nimbusml/ensemble/booster/goss.py b/src/python/nimbusml/ensemble/booster/goss.py index 64863766..8e57181b 100644 --- a/src/python/nimbusml/ensemble/booster/goss.py +++ b/src/python/nimbusml/ensemble/booster/goss.py @@ -40,43 +40,39 @@ class Goss(core): :param other_rate: Retain ratio for small gradient instances. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. 
However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. + :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. 
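Goss follows the same pattern: ``top_rate`` and ``other_rate`` keep their names, while the shared tree parameters pick up the renamed forms and the class-balance options move off the booster. A short sketch with hypothetical values for illustration only:

    from nimbusml.ensemble.booster import Goss

    # Hypothetical values; only the argument names are taken from the hunk above.
    booster = Goss(
        top_rate=0.2,
        other_rate=0.1,
        maximum_tree_depth=6,      # was max_depth
        minimum_child_weight=0.1,  # was min_child_weight
        feature_fraction=0.8,
        l2_regularization=0.01)    # was reg_lambda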
@@ -101,31 +97,27 @@ def __init__( self, top_rate=0.2, other_rate=0.1, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): core.__init__( self, top_rate=top_rate, other_rate=other_rate, - unbalanced_sets=unbalanced_sets, - min_split_gain=min_split_gain, - max_depth=max_depth, - min_child_weight=min_child_weight, - subsample_freq=subsample_freq, - subsample=subsample, + minimum_split_gain=minimum_split_gain, + maximum_tree_depth=maximum_tree_depth, + minimum_child_weight=minimum_child_weight, + subsample_frequency=subsample_frequency, + subsample_fraction=subsample_fraction, feature_fraction=feature_fraction, - reg_lambda=reg_lambda, - reg_alpha=reg_alpha, - scale_pos_weight=scale_pos_weight, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, **params) def get_params(self, deep=False): diff --git a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py index 09c7677f..ea911977 100644 --- a/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/fastforestbinaryclassifier.py @@ -73,19 +73,20 @@ class FastForestBinaryClassifier( :param weight: see `Columns `_. - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param normalize: If ``Auto``, the choice to normalize depends on the preference declared by the algorithm. This is the default choice. If @@ -95,22 +96,22 @@ class FastForestBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. 
- :param max_tree_output: Upper bound on absolute value of single tree - output. + :param maximum_output_magnitude_per_tree: Upper bound on absolute value of + single tree output. - :param quantile_sample_count: Number of labels to be sampled from each leaf - to make the distribtuion. + :param number_of_quantile_samples: Number of labels to be sampled from each + leaf to make the distribution. :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -128,19 +129,19 @@ class FastForestBinaryClassifier( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -149,7 +150,8 @@ class FastForestBinaryClassifier( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -168,17 +170,18 @@ class FastForestBinaryClassifier( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. 
+ :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -189,9 +192,6 @@ class FastForestBinaryClassifier( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. @@ -213,44 +213,43 @@ class FastForestBinaryClassifier( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, normalize='Auto', caching='Auto', - max_tree_output=100.0, - quantile_sample_count=100, + maximum_output_magnitude_per_tree=100.0, + number_of_quantile_samples=100, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - example_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, feature=None, group_id=None, @@ -258,67 +257,66 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - num_trees=num_trees, - num_leaves=num_leaves, - min_split=min_split, + number_of_trees=number_of_trees, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, normalize=normalize, caching=caching, - 
max_tree_output=max_tree_output, - quantile_sample_count=quantile_sample_count, + maximum_output_magnitude_per_tree=maximum_output_magnitude_per_tree, + number_of_quantile_samples=number_of_quantile_samples, parallel_trainer=parallel_trainer, - train_threads=train_threads, + number_of_threads=number_of_threads, random_state=random_state, - feature_select_seed=feature_select_seed, + feature_selection_seed=feature_selection_seed, entropy_coefficient=entropy_coefficient, histogram_pool_size=histogram_pool_size, disk_transpose=disk_transpose, feature_flocks=feature_flocks, categorical_split=categorical_split, - max_categorical_groups_per_node=max_categorical_groups_per_node, - max_categorical_split_points=max_categorical_split_points, - min_docs_percentage_split=min_docs_percentage_split, - min_docs_for_categorical_split=min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=minimum_examples_for_categorical_split, bias=bias, bundling=bundling, - num_bins=num_bins, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, sparsify_threshold=sparsify_threshold, first_use_penalty=first_use_penalty, feature_reuse_penalty=feature_reuse_penalty, gain_conf_level=gain_conf_level, softmax_temperature=softmax_temperature, - execution_times=execution_times, + execution_time=execution_time, feature_fraction=feature_fraction, bagging_size=bagging_size, - example_fraction=example_fraction, - split_fraction=split_fraction, + bagging_example_fraction=bagging_example_fraction, + feature_fraction_per_split=feature_fraction_per_split, smoothing=smoothing, allow_empty_trees=allow_empty_trees, feature_compression_level=feature_compression_level, compress_ensemble=compress_ensemble, - max_trees_after_compression=max_trees_after_compression, test_frequency=test_frequency, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/fastforestregressor.py b/src/python/nimbusml/ensemble/fastforestregressor.py index 9255d953..5a2affe4 100644 --- a/src/python/nimbusml/ensemble/fastforestregressor.py +++ b/src/python/nimbusml/ensemble/fastforestregressor.py @@ -82,19 +82,20 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): :param weight: see `Columns `_. - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. 
That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param normalize: If ``Auto``, the choice to normalize depends on the preference declared by the algorithm. This is the default choice. If @@ -104,23 +105,23 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param shuffle_labels: Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass. - :param quantile_sample_count: Number of labels to be sampled from each leaf - to make the distribtuion. + :param number_of_quantile_samples: Number of labels to be sampled from each + leaf to make the distribution. :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -138,19 +139,19 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -159,7 +160,8 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. 
- :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -178,17 +180,18 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -199,9 +202,6 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. @@ -223,44 +223,43 @@ class FastForestRegressor(core, BasePredictor, RegressorMixin): @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, normalize='Auto', caching='Auto', shuffle_labels=False, - quantile_sample_count=100, + number_of_quantile_samples=100, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - example_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, feature=None, group_id=None, @@ -268,67 +267,66 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must 
be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - num_trees=num_trees, - num_leaves=num_leaves, - min_split=min_split, + number_of_trees=number_of_trees, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, normalize=normalize, caching=caching, shuffle_labels=shuffle_labels, - quantile_sample_count=quantile_sample_count, + number_of_quantile_samples=number_of_quantile_samples, parallel_trainer=parallel_trainer, - train_threads=train_threads, + number_of_threads=number_of_threads, random_state=random_state, - feature_select_seed=feature_select_seed, + feature_selection_seed=feature_selection_seed, entropy_coefficient=entropy_coefficient, histogram_pool_size=histogram_pool_size, disk_transpose=disk_transpose, feature_flocks=feature_flocks, categorical_split=categorical_split, - max_categorical_groups_per_node=max_categorical_groups_per_node, - max_categorical_split_points=max_categorical_split_points, - min_docs_percentage_split=min_docs_percentage_split, - min_docs_for_categorical_split=min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=minimum_examples_for_categorical_split, bias=bias, bundling=bundling, - num_bins=num_bins, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, sparsify_threshold=sparsify_threshold, first_use_penalty=first_use_penalty, feature_reuse_penalty=feature_reuse_penalty, gain_conf_level=gain_conf_level, softmax_temperature=softmax_temperature, - execution_times=execution_times, + execution_time=execution_time, feature_fraction=feature_fraction, bagging_size=bagging_size, - example_fraction=example_fraction, - split_fraction=split_fraction, + bagging_example_fraction=bagging_example_fraction, + feature_fraction_per_split=feature_fraction_per_split, smoothing=smoothing, allow_empty_trees=allow_empty_trees, feature_compression_level=feature_compression_level, compress_ensemble=compress_ensemble, - max_trees_after_compression=max_trees_after_compression, test_frequency=test_frequency, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py index 7989a1e9..8c12cb48 100644 --- a/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/fasttreesbinaryclassifier.py @@ -91,19 +91,20 @@ class FastTreesBinaryClassifier( :param weight: see `Columns `_. - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. 
By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -120,18 +121,19 @@ class FastTreesBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param unbalanced_sets: Should we use derivatives optimized for unbalanced - sets. + :param unbalanced_sets: Option for using derivatives optimized for + unbalanced sets. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -160,7 +162,7 @@ class FastTreesBinaryClassifier( :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. - :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -175,17 +177,17 @@ class FastTreesBinaryClassifier( normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. 
- :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -203,19 +205,19 @@ class FastTreesBinaryClassifier( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -224,7 +226,8 @@ class FastTreesBinaryClassifier( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -243,17 +246,18 @@ class FastTreesBinaryClassifier( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -264,9 +268,6 @@ class FastTreesBinaryClassifier( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
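Taken together, the docstring changes above amount to a straightforward renaming of constructor arguments. A minimal usage sketch, assuming the nimbusml sklearn-style fit/predict API and purely illustrative toy data (the same renames apply to the FastForest learners earlier in this diff):

import numpy as np
import pandas as pd
from nimbusml.ensemble import FastTreesBinaryClassifier

# Toy data, purely for illustration.
X = pd.DataFrame({'x1': np.random.rand(100), 'x2': np.random.rand(100)})
y = pd.Series(np.random.randint(0, 2, 100), name='label')

clf = FastTreesBinaryClassifier(
    number_of_trees=100,                 # was num_trees
    number_of_leaves=20,                 # was num_leaves
    minimum_example_count_per_leaf=10,   # was min_split
    maximum_tree_output=100.0,           # was max_tree_output
    number_of_threads=1)                 # was train_threads
clf.fit(X, y)
print(clf.predict(X).head())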
@@ -288,20 +289,20 @@ class FastTreesBinaryClassifier( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', unbalanced_sets=False, best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -310,43 +311,42 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, feature=None, group_id=None, @@ -354,40 +354,40 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - num_trees=num_trees, - num_leaves=num_leaves, - min_split=min_split, + number_of_trees=number_of_trees, + number_of_leaves=number_of_leaves, 
+ minimum_example_count_per_leaf=minimum_example_count_per_leaf, learning_rate=learning_rate, normalize=normalize, caching=caching, unbalanced_sets=unbalanced_sets, best_step_trees=best_step_trees, use_line_search=use_line_search, - num_post_bracket_steps=num_post_bracket_steps, - min_step_size=min_step_size, + maximum_number_of_line_search_steps=maximum_number_of_line_search_steps, + minimum_step_size=minimum_step_size, optimizer=optimizer, early_stopping_rule=early_stopping_rule, early_stopping_metrics=early_stopping_metrics, @@ -399,43 +399,42 @@ def __init__( dropout_rate=dropout_rate, get_derivatives_sample_rate=get_derivatives_sample_rate, write_last_ensemble=write_last_ensemble, - max_tree_output=max_tree_output, + maximum_tree_output=maximum_tree_output, random_start=random_start, filter_zero_lambdas=filter_zero_lambdas, baseline_scores_formula=baseline_scores_formula, baseline_alpha_risk=baseline_alpha_risk, position_discount_freeform=position_discount_freeform, parallel_trainer=parallel_trainer, - train_threads=train_threads, + number_of_threads=number_of_threads, random_state=random_state, - feature_select_seed=feature_select_seed, + feature_selection_seed=feature_selection_seed, entropy_coefficient=entropy_coefficient, histogram_pool_size=histogram_pool_size, disk_transpose=disk_transpose, feature_flocks=feature_flocks, categorical_split=categorical_split, - max_categorical_groups_per_node=max_categorical_groups_per_node, - max_categorical_split_points=max_categorical_split_points, - min_docs_percentage_split=min_docs_percentage_split, - min_docs_for_categorical_split=min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=minimum_examples_for_categorical_split, bias=bias, bundling=bundling, - num_bins=num_bins, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, sparsify_threshold=sparsify_threshold, first_use_penalty=first_use_penalty, feature_reuse_penalty=feature_reuse_penalty, gain_conf_level=gain_conf_level, softmax_temperature=softmax_temperature, - execution_times=execution_times, + execution_time=execution_time, feature_fraction=feature_fraction, bagging_size=bagging_size, - example_fraction=example_fraction, - split_fraction=split_fraction, + bagging_example_fraction=bagging_example_fraction, + feature_fraction_per_split=feature_fraction_per_split, smoothing=smoothing, allow_empty_trees=allow_empty_trees, feature_compression_level=feature_compression_level, compress_ensemble=compress_ensemble, - max_trees_after_compression=max_trees_after_compression, test_frequency=test_frequency, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/fasttreesregressor.py b/src/python/nimbusml/ensemble/fasttreesregressor.py index 3a55bb4c..c3994230 100644 --- a/src/python/nimbusml/ensemble/fasttreesregressor.py +++ b/src/python/nimbusml/ensemble/fasttreesregressor.py @@ -93,19 +93,20 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): :param weight: see `Columns `_. - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. 
By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -122,15 +123,16 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -159,7 +161,7 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. - :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -174,17 +176,17 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. 
@@ -202,19 +204,19 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -223,7 +225,8 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -242,17 +245,18 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -263,9 +267,6 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
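The bagging- and per-split sampling options follow the same renaming pattern; a brief illustrative sketch (values are the documented defaults, not recommendations):

from nimbusml.ensemble import FastTreesRegressor

reg = FastTreesRegressor(
    number_of_trees=100,
    bagging_size=1,                   # >0 enables bagging
    bagging_example_fraction=0.7,     # was example_fraction
    feature_fraction_per_split=1.0,   # was split_fraction
    execution_time=False)             # was execution_times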
@@ -287,16 +288,16 @@ class FastTreesRegressor(core, BasePredictor, RegressorMixin): @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, early_stopping_metrics=1, @@ -308,43 +309,42 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, feature=None, group_id=None, @@ -352,39 +352,39 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - num_trees=num_trees, - num_leaves=num_leaves, - min_split=min_split, + number_of_trees=number_of_trees, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, learning_rate=learning_rate, 
normalize=normalize, caching=caching, best_step_trees=best_step_trees, use_line_search=use_line_search, - num_post_bracket_steps=num_post_bracket_steps, - min_step_size=min_step_size, + maximum_number_of_line_search_steps=maximum_number_of_line_search_steps, + minimum_step_size=minimum_step_size, optimizer=optimizer, early_stopping_rule=early_stopping_rule, early_stopping_metrics=early_stopping_metrics, @@ -396,43 +396,42 @@ def __init__( dropout_rate=dropout_rate, get_derivatives_sample_rate=get_derivatives_sample_rate, write_last_ensemble=write_last_ensemble, - max_tree_output=max_tree_output, + maximum_tree_output=maximum_tree_output, random_start=random_start, filter_zero_lambdas=filter_zero_lambdas, baseline_scores_formula=baseline_scores_formula, baseline_alpha_risk=baseline_alpha_risk, position_discount_freeform=position_discount_freeform, parallel_trainer=parallel_trainer, - train_threads=train_threads, + number_of_threads=number_of_threads, random_state=random_state, - feature_select_seed=feature_select_seed, + feature_selection_seed=feature_selection_seed, entropy_coefficient=entropy_coefficient, histogram_pool_size=histogram_pool_size, disk_transpose=disk_transpose, feature_flocks=feature_flocks, categorical_split=categorical_split, - max_categorical_groups_per_node=max_categorical_groups_per_node, - max_categorical_split_points=max_categorical_split_points, - min_docs_percentage_split=min_docs_percentage_split, - min_docs_for_categorical_split=min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=minimum_examples_for_categorical_split, bias=bias, bundling=bundling, - num_bins=num_bins, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, sparsify_threshold=sparsify_threshold, first_use_penalty=first_use_penalty, feature_reuse_penalty=feature_reuse_penalty, gain_conf_level=gain_conf_level, softmax_temperature=softmax_temperature, - execution_times=execution_times, + execution_time=execution_time, feature_fraction=feature_fraction, bagging_size=bagging_size, - example_fraction=example_fraction, - split_fraction=split_fraction, + bagging_example_fraction=bagging_example_fraction, + feature_fraction_per_split=feature_fraction_per_split, smoothing=smoothing, allow_empty_trees=allow_empty_trees, feature_compression_level=feature_compression_level, compress_ensemble=compress_ensemble, - max_trees_after_compression=max_trees_after_compression, test_frequency=test_frequency, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py index e9ac1750..1db266b7 100644 --- a/src/python/nimbusml/ensemble/fasttreestweedieregressor.py +++ b/src/python/nimbusml/ensemble/fasttreestweedieregressor.py @@ -48,19 +48,20 @@ class FastTreesTweedieRegressor( :param weight: see `Columns `_. - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. 
- :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -91,19 +92,20 @@ class FastTreesTweedieRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param index: Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -132,7 +134,7 @@ class FastTreesTweedieRegressor( :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. - :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -147,17 +149,17 @@ class FastTreesTweedieRegressor( normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -175,19 +177,19 @@ class FastTreesTweedieRegressor( :param categorical_split: Whether to do split based on multiple categorical feature values. 
- :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -196,7 +198,8 @@ class FastTreesTweedieRegressor( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -215,17 +218,18 @@ class FastTreesTweedieRegressor( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -236,9 +240,6 @@ class FastTreesTweedieRegressor( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
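A minimal, hypothetical construction of FastTreesTweedieRegressor with the renamed arguments described above (the Tweedie index argument itself is unchanged by this diff):

from nimbusml.ensemble import FastTreesTweedieRegressor

reg = FastTreesTweedieRegressor(
    number_of_trees=100,                 # was num_trees
    number_of_leaves=20,                 # was num_leaves
    minimum_example_count_per_leaf=10,   # was min_split
    index=1.5,                           # Tweedie index: 1 = Poisson, 2 = gamma
    maximum_tree_output=100.0,           # was max_tree_output
    maximum_bin_count_per_feature=255)   # was num_bins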
@@ -261,20 +262,20 @@ class FastTreesTweedieRegressor( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', index=1.5, best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -283,43 +284,42 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, feature=None, group_id=None, @@ -327,40 +327,40 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - num_trees=num_trees, - num_leaves=num_leaves, - min_split=min_split, + number_of_trees=number_of_trees, + number_of_leaves=number_of_leaves, + 
minimum_example_count_per_leaf=minimum_example_count_per_leaf, learning_rate=learning_rate, normalize=normalize, caching=caching, index=index, best_step_trees=best_step_trees, use_line_search=use_line_search, - num_post_bracket_steps=num_post_bracket_steps, - min_step_size=min_step_size, + maximum_number_of_line_search_steps=maximum_number_of_line_search_steps, + minimum_step_size=minimum_step_size, optimizer=optimizer, early_stopping_rule=early_stopping_rule, early_stopping_metrics=early_stopping_metrics, @@ -372,43 +372,42 @@ def __init__( dropout_rate=dropout_rate, get_derivatives_sample_rate=get_derivatives_sample_rate, write_last_ensemble=write_last_ensemble, - max_tree_output=max_tree_output, + maximum_tree_output=maximum_tree_output, random_start=random_start, filter_zero_lambdas=filter_zero_lambdas, baseline_scores_formula=baseline_scores_formula, baseline_alpha_risk=baseline_alpha_risk, position_discount_freeform=position_discount_freeform, parallel_trainer=parallel_trainer, - train_threads=train_threads, + number_of_threads=number_of_threads, random_state=random_state, - feature_select_seed=feature_select_seed, + feature_selection_seed=feature_selection_seed, entropy_coefficient=entropy_coefficient, histogram_pool_size=histogram_pool_size, disk_transpose=disk_transpose, feature_flocks=feature_flocks, categorical_split=categorical_split, - max_categorical_groups_per_node=max_categorical_groups_per_node, - max_categorical_split_points=max_categorical_split_points, - min_docs_percentage_split=min_docs_percentage_split, - min_docs_for_categorical_split=min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=minimum_examples_for_categorical_split, bias=bias, bundling=bundling, - num_bins=num_bins, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, sparsify_threshold=sparsify_threshold, first_use_penalty=first_use_penalty, feature_reuse_penalty=feature_reuse_penalty, gain_conf_level=gain_conf_level, softmax_temperature=softmax_temperature, - execution_times=execution_times, + execution_time=execution_time, feature_fraction=feature_fraction, bagging_size=bagging_size, - example_fraction=example_fraction, - split_fraction=split_fraction, + bagging_example_fraction=bagging_example_fraction, + feature_fraction_per_split=feature_fraction_per_split, smoothing=smoothing, allow_empty_trees=allow_empty_trees, feature_compression_level=feature_compression_level, compress_ensemble=compress_ensemble, - max_trees_after_compression=max_trees_after_compression, test_frequency=test_frequency, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/gambinaryclassifier.py b/src/python/nimbusml/ensemble/gambinaryclassifier.py index 2427c2ba..eb08e95c 100644 --- a/src/python/nimbusml/ensemble/gambinaryclassifier.py +++ b/src/python/nimbusml/ensemble/gambinaryclassifier.py @@ -87,10 +87,13 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): :param weight: see `Columns `_. - :param num_iterations: Total number of iterations over all features. + :param number_of_iterations: Total number of iterations over all features. - :param min_documents: Minimum number of training instances required to form - a partition. 
+ :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -121,7 +124,7 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param unbalanced_sets: Should we use derivatives optimized for unbalanced sets. @@ -132,15 +135,16 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): :param gain_conf_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. - :param max_output: Upper bound on absolute value of single output. + :param maximum_tree_output: Upper bound on absolute value of single output. :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function. @@ -171,18 +175,18 @@ class GamBinaryClassifier(core, BasePredictor, ClassifierMixin): @trace def __init__( self, - num_iterations=9500, - min_documents=10, + number_of_iterations=9500, + minimum_example_count_per_leaf=10, learning_rate=0.002, normalize='Auto', caching='Auto', unbalanced_sets=False, entropy_coefficient=0.0, gain_conf_level=0, - train_threads=None, + number_of_threads=None, disk_transpose=None, - num_bins=255, - max_output=float('inf'), + maximum_bin_count_per_feature=255, + maximum_tree_output=float('inf'), get_derivatives_sample_rate=1, random_state=123, feature_flocks=True, @@ -192,36 +196,36 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - num_iterations=num_iterations, - min_documents=min_documents, + number_of_iterations=number_of_iterations, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, learning_rate=learning_rate, normalize=normalize, caching=caching, 
unbalanced_sets=unbalanced_sets, entropy_coefficient=entropy_coefficient, gain_conf_level=gain_conf_level, - train_threads=train_threads, + number_of_threads=number_of_threads, disk_transpose=disk_transpose, - num_bins=num_bins, - max_output=max_output, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + maximum_tree_output=maximum_tree_output, get_derivatives_sample_rate=get_derivatives_sample_rate, random_state=random_state, feature_flocks=feature_flocks, diff --git a/src/python/nimbusml/ensemble/gamregressor.py b/src/python/nimbusml/ensemble/gamregressor.py index 13587cd8..c57ad499 100644 --- a/src/python/nimbusml/ensemble/gamregressor.py +++ b/src/python/nimbusml/ensemble/gamregressor.py @@ -86,10 +86,13 @@ class GamRegressor(core, BasePredictor, RegressorMixin): :param weight: see `Columns `_. - :param num_iterations: Total number of iterations over all features. + :param number_of_iterations: Total number of iterations over all features. - :param min_documents: Minimum number of training instances required to form - a partition. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -120,7 +123,7 @@ class GamRegressor(core, BasePredictor, RegressorMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param pruning_metrics: Metric for pruning. (For regression, 1: L1, 2:L2; default L2). @@ -131,15 +134,16 @@ class GamRegressor(core, BasePredictor, RegressorMixin): :param gain_conf_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. - :param max_output: Upper bound on absolute value of single output. + :param maximum_tree_output: Upper bound on absolute value of single output. :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function. 
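The same rename pattern applies to both GAM learners. A minimal sketch of GamRegressor under the new argument names, using toy data invented purely for illustration (old names noted in comments; not code from this repo):

```python
# Toy data, invented for illustration only.
import pandas as pd
from nimbusml.ensemble import GamRegressor

X = pd.DataFrame(dict(x1=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
                      x2=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]))
y = pd.Series([1.1, 2.0, 3.2, 3.9, 5.1, 6.0, 6.8, 8.2], name='y')

gam = GamRegressor(
    number_of_iterations=200,            # was num_iterations
    minimum_example_count_per_leaf=1,    # was min_documents
    number_of_threads=1,                 # was train_threads
    maximum_bin_count_per_feature=255,   # was num_bins
    maximum_tree_output=float('inf'))    # was max_output
gam.fit(X, y)
print(gam.predict(X))
```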
@@ -171,18 +175,18 @@ class GamRegressor(core, BasePredictor, RegressorMixin): @trace def __init__( self, - num_iterations=9500, - min_documents=10, + number_of_iterations=9500, + minimum_example_count_per_leaf=10, learning_rate=0.002, normalize='Auto', caching='Auto', pruning_metrics=2, entropy_coefficient=0.0, gain_conf_level=0, - train_threads=None, + number_of_threads=None, disk_transpose=None, - num_bins=255, - max_output=float('inf'), + maximum_bin_count_per_feature=255, + maximum_tree_output=float('inf'), get_derivatives_sample_rate=1, random_state=123, feature_flocks=True, @@ -192,36 +196,36 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - num_iterations=num_iterations, - min_documents=min_documents, + number_of_iterations=number_of_iterations, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, learning_rate=learning_rate, normalize=normalize, caching=caching, pruning_metrics=pruning_metrics, entropy_coefficient=entropy_coefficient, gain_conf_level=gain_conf_level, - train_threads=train_threads, + number_of_threads=number_of_threads, disk_transpose=disk_transpose, - num_bins=num_bins, - max_output=max_output, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + maximum_tree_output=maximum_tree_output, get_derivatives_sample_rate=get_derivatives_sample_rate, random_state=random_state, feature_flocks=feature_flocks, diff --git a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py index 8f0d3673..c87bbbb0 100644 --- a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py @@ -45,17 +45,25 @@ class LightGbmBinaryClassifier( :param weight: see `Columns `_. - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. 
If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. Available options are: @@ -71,43 +79,50 @@ class LightGbmBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. + :param unbalanced_sets: Use for binary classification when training data is + not balanced. - :param verbose_eval: Verbose. + :param weight_of_positive_examples: Control the balance of positive and + negative weights, useful for unbalanced classes. A typical value to + consider: sum(negative cases) / sum(positive cases). - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. + + :param handle_missing_value: Enable special handling of missing value or + not. - :param use_missing: Enable missing value auto infer or not. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param min_data_per_group: Min number of instances per categorical group. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param max_cat_threshold: Max number of categorical thresholds. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param l2_categorical_regularization: L2 Regularization for categorical + split. - :param cat_l2: L2 Regularization for categorical split. + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. 
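To make the renames above concrete, a hedged sketch of constructing the binary trainer with the new argument names (toy data invented for illustration; old names noted inline):

```python
# Toy data, invented for illustration only.
import pandas as pd
from nimbusml.ensemble import LightGbmBinaryClassifier

X = pd.DataFrame(dict(f1=[0.1, 0.9, 0.2, 0.8, 0.3, 0.7],
                      f2=[1.0, 0.0, 1.0, 0.0, 1.0, 0.0]))
y = pd.Series([0, 1, 0, 1, 0, 1], name='label')

clf = LightGbmBinaryClassifier(
    number_of_iterations=50,             # was num_boost_round
    number_of_leaves=4,                  # was num_leaves
    minimum_example_count_per_leaf=1,    # was min_data_per_leaf
    maximum_bin_count_per_feature=255,   # was max_bin
    number_of_threads=1,                 # was n_thread
    evaluation_metric='Logloss',         # was eval_metric
    random_state=1)                      # new: seed passed to LightGBM
clf.fit(X, y)
print(clf.predict(X))
```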
@@ -131,29 +146,30 @@ class LightGbmBinaryClassifier( @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + unbalanced_sets=False, + weight_of_positive_examples=1.0, + sigmoid=0.5, + evaluation_metric='Logloss', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, feature=None, group_id=None, @@ -161,52 +177,53 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - num_boost_round=num_boost_round, + number_of_iterations=number_of_iterations, learning_rate=learning_rate, - num_leaves=num_leaves, - min_data_per_leaf=min_data_per_leaf, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, booster=booster, normalize=normalize, caching=caching, - max_bin=max_bin, - verbose_eval=verbose_eval, + unbalanced_sets=unbalanced_sets, + weight_of_positive_examples=weight_of_positive_examples, + sigmoid=sigmoid, + evaluation_metric=evaluation_metric, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + verbose=verbose, silent=silent, - n_thread=n_thread, - eval_metric=eval_metric, - use_softmax=use_softmax, + number_of_threads=number_of_threads, early_stopping_round=early_stopping_round, - custom_gains=custom_gains, - sigmoid=sigmoid, batch_size=batch_size, - use_cat=use_cat, - use_missing=use_missing, - min_data_per_group=min_data_per_group, - max_cat_threshold=max_cat_threshold, - cat_smooth=cat_smooth, - cat_l2=cat_l2, + use_categorical_split=use_categorical_split, + handle_missing_value=handle_missing_value, + 
minimum_example_count_per_group=minimum_example_count_per_group, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + categorical_smoothing=categorical_smoothing, + l2_categorical_regularization=l2_categorical_regularization, + random_state=random_state, parallel_trainer=parallel_trainer, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/lightgbmclassifier.py b/src/python/nimbusml/ensemble/lightgbmclassifier.py index a8e56eaf..b59c4f7c 100644 --- a/src/python/nimbusml/ensemble/lightgbmclassifier.py +++ b/src/python/nimbusml/ensemble/lightgbmclassifier.py @@ -42,17 +42,25 @@ class LightGbmClassifier(core, BasePredictor, ClassifierMixin): :param weight: see `Columns `_. - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. Available options are: @@ -68,43 +76,45 @@ class LightGbmClassifier(core, BasePredictor, ClassifierMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. - - :param verbose_eval: Verbose. + :param use_softmax: Use softmax loss for the multi classification. - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. 
Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. - :param use_missing: Enable missing value auto infer or not. + :param handle_missing_value: Enable special handling of missing value or + not. - :param min_data_per_group: Min number of instances per categorical group. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param max_cat_threshold: Max number of categorical thresholds. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_l2: L2 Regularization for categorical split. + :param l2_categorical_regularization: L2 Regularization for categorical + split. + + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. @@ -128,29 +138,29 @@ class LightGbmClassifier(core, BasePredictor, ClassifierMixin): @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, - silent=True, - n_thread=None, - eval_metric='DefaultMetric', use_softmax=None, - early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', sigmoid=0.5, + evaluation_metric='Error', + maximum_bin_count_per_feature=255, + verbose=False, + silent=True, + number_of_threads=None, + early_stopping_round=0, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, feature=None, group_id=None, @@ -158,52 +168,52 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be 
renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - num_boost_round=num_boost_round, + number_of_iterations=number_of_iterations, learning_rate=learning_rate, - num_leaves=num_leaves, - min_data_per_leaf=min_data_per_leaf, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, booster=booster, normalize=normalize, caching=caching, - max_bin=max_bin, - verbose_eval=verbose_eval, - silent=silent, - n_thread=n_thread, - eval_metric=eval_metric, use_softmax=use_softmax, - early_stopping_round=early_stopping_round, - custom_gains=custom_gains, sigmoid=sigmoid, + evaluation_metric=evaluation_metric, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + verbose=verbose, + silent=silent, + number_of_threads=number_of_threads, + early_stopping_round=early_stopping_round, batch_size=batch_size, - use_cat=use_cat, - use_missing=use_missing, - min_data_per_group=min_data_per_group, - max_cat_threshold=max_cat_threshold, - cat_smooth=cat_smooth, - cat_l2=cat_l2, + use_categorical_split=use_categorical_split, + handle_missing_value=handle_missing_value, + minimum_example_count_per_group=minimum_example_count_per_group, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + categorical_smoothing=categorical_smoothing, + l2_categorical_regularization=l2_categorical_regularization, + random_state=random_state, parallel_trainer=parallel_trainer, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/lightgbmranker.py b/src/python/nimbusml/ensemble/lightgbmranker.py index 890b4de0..fb96f5cd 100644 --- a/src/python/nimbusml/ensemble/lightgbmranker.py +++ b/src/python/nimbusml/ensemble/lightgbmranker.py @@ -45,17 +45,25 @@ class LightGbmRanker(core, BasePredictor, ClassifierMixin): :param weight: see `Columns `_. - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. 
Available options are: @@ -71,43 +79,45 @@ class LightGbmRanker(core, BasePredictor, ClassifierMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. - - :param max_bin: Max number of bucket bin for features. + :param caching: Whether trainer should cache input training data. - :param verbose_eval: Verbose. + :param custom_gains: An array of gains associated to each relevance label. - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. - :param use_missing: Enable missing value auto infer or not. + :param handle_missing_value: Enable special handling of missing value or + not. - :param min_data_per_group: Min number of instances per categorical group. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param max_cat_threshold: Max number of categorical thresholds. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_l2: L2 Regularization for categorical split. + :param l2_categorical_regularization: L2 Regularization for categorical + split. + + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. 
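A construction-only sketch of LightGbmRanker with the renamed arguments; note that custom_gains is now a list of integers rather than a comma-separated string, and the column names below are placeholders invented for illustration:

```python
# Construction only; column names are placeholders for illustration.
from nimbusml.ensemble import LightGbmRanker

ranker = LightGbmRanker(
    number_of_iterations=100,            # was num_boost_round
    number_of_leaves=8,                  # was num_leaves
    minimum_example_count_per_leaf=1,    # was min_data_per_leaf
    custom_gains=[0, 3, 7, 15, 31],      # now a list, not '0,3,7,15,31'
    evaluation_metric='NormalizedDiscountedCumulativeGain',
    feature=['f1', 'f2'],
    group_id='query_id',
    label='relevance')
print(ranker.get_params())
```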
@@ -131,29 +141,29 @@ class LightGbmRanker(core, BasePredictor, ClassifierMixin): @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], + sigmoid=0.5, + evaluation_metric='NormalizedDiscountedCumulativeGain', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, feature=None, group_id=None, @@ -161,52 +171,52 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='ranker', **params) core.__init__( self, - num_boost_round=num_boost_round, + number_of_iterations=number_of_iterations, learning_rate=learning_rate, - num_leaves=num_leaves, - min_data_per_leaf=min_data_per_leaf, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, booster=booster, normalize=normalize, caching=caching, - max_bin=max_bin, - verbose_eval=verbose_eval, - silent=silent, - n_thread=n_thread, - eval_metric=eval_metric, - use_softmax=use_softmax, - early_stopping_round=early_stopping_round, custom_gains=custom_gains, sigmoid=sigmoid, + evaluation_metric=evaluation_metric, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + verbose=verbose, + silent=silent, + number_of_threads=number_of_threads, + early_stopping_round=early_stopping_round, batch_size=batch_size, - use_cat=use_cat, - use_missing=use_missing, - min_data_per_group=min_data_per_group, - max_cat_threshold=max_cat_threshold, - cat_smooth=cat_smooth, - cat_l2=cat_l2, + use_categorical_split=use_categorical_split, + handle_missing_value=handle_missing_value, + 
minimum_example_count_per_group=minimum_example_count_per_group, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + categorical_smoothing=categorical_smoothing, + l2_categorical_regularization=l2_categorical_regularization, + random_state=random_state, parallel_trainer=parallel_trainer, **params) self.feature = feature diff --git a/src/python/nimbusml/ensemble/lightgbmregressor.py b/src/python/nimbusml/ensemble/lightgbmregressor.py index 8ad088c4..0d0a69ae 100644 --- a/src/python/nimbusml/ensemble/lightgbmregressor.py +++ b/src/python/nimbusml/ensemble/lightgbmregressor.py @@ -42,17 +42,25 @@ class LightGbmRegressor(core, BasePredictor, RegressorMixin): :param weight: see `Columns `_. - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. Available options are: @@ -68,43 +76,41 @@ class LightGbmRegressor(core, BasePredictor, RegressorMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. + :param evaluation_metric: Evaluation metrics. - :param verbose_eval: Verbose. - - :param silent: Printing running messages. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param n_thread: Number of parallel threads used to run LightGBM. + :param verbose: Verbose. - :param eval_metric: Evaluation metrics. + :param silent: Printing running messages. - :param use_softmax: Use softmax loss for the multi classification. + :param number_of_threads: Number of parallel threads used to run LightGBM. :param early_stopping_round: Rounds of early stopping, 0 will disable it. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. - - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. 
- :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. + + :param handle_missing_value: Enable special handling of missing value or + not. - :param use_missing: Enable missing value auto infer or not. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param min_data_per_group: Min number of instances per categorical group. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param max_cat_threshold: Max number of categorical thresholds. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param l2_categorical_regularization: L2 Regularization for categorical + split. - :param cat_l2: L2 Regularization for categorical split. + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. @@ -128,29 +134,27 @@ class LightGbmRegressor(core, BasePredictor, RegressorMixin): @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + evaluation_metric='RootMeanSquaredError', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, feature=None, group_id=None, @@ -158,52 +162,50 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'group_id_column' in params: + params['feature_column_name'] = feature + if 'row_group_column_name' in params: raise NameError( - "'group_id_column' must be renamed to 'group_id'") + "'row_group_column_name' must be renamed to 'group_id'") if group_id: - params['group_id_column'] = group_id - if 'label_column' in params: + params['row_group_column_name'] = group_id + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - 
num_boost_round=num_boost_round, + number_of_iterations=number_of_iterations, learning_rate=learning_rate, - num_leaves=num_leaves, - min_data_per_leaf=min_data_per_leaf, + number_of_leaves=number_of_leaves, + minimum_example_count_per_leaf=minimum_example_count_per_leaf, booster=booster, normalize=normalize, caching=caching, - max_bin=max_bin, - verbose_eval=verbose_eval, + evaluation_metric=evaluation_metric, + maximum_bin_count_per_feature=maximum_bin_count_per_feature, + verbose=verbose, silent=silent, - n_thread=n_thread, - eval_metric=eval_metric, - use_softmax=use_softmax, + number_of_threads=number_of_threads, early_stopping_round=early_stopping_round, - custom_gains=custom_gains, - sigmoid=sigmoid, batch_size=batch_size, - use_cat=use_cat, - use_missing=use_missing, - min_data_per_group=min_data_per_group, - max_cat_threshold=max_cat_threshold, - cat_smooth=cat_smooth, - cat_l2=cat_l2, + use_categorical_split=use_categorical_split, + handle_missing_value=handle_missing_value, + minimum_example_count_per_group=minimum_example_count_per_group, + maximum_categorical_split_point_count=maximum_categorical_split_point_count, + categorical_smoothing=categorical_smoothing, + l2_categorical_regularization=l2_categorical_regularization, + random_state=random_state, parallel_trainer=parallel_trainer, **params) self.feature = feature diff --git a/src/python/nimbusml/examples/CountSelector.py b/src/python/nimbusml/examples/CountSelector.py index 9c00c37e..434f00e1 100644 --- a/src/python/nimbusml/examples/CountSelector.py +++ b/src/python/nimbusml/examples/CountSelector.py @@ -18,7 +18,7 @@ pip = Pipeline([ - OneHotHashVectorizer(columns={'edu': 'education'}, hash_bits=2), + OneHotHashVectorizer(columns={'edu': 'education'}, number_of_bits=2), CountSelector(count=5, columns=['edu']) ]) features_selection = pip.fit_transform(data) diff --git a/src/python/nimbusml/examples/PipelineWithGridSearchCV1.py b/src/python/nimbusml/examples/PipelineWithGridSearchCV1.py index fcd1fc47..0aa30c7b 100644 --- a/src/python/nimbusml/examples/PipelineWithGridSearchCV1.py +++ b/src/python/nimbusml/examples/PipelineWithGridSearchCV1.py @@ -7,26 +7,26 @@ OneHotVectorizer from sklearn.model_selection import GridSearchCV -df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'], - workclass=['X', 'X', 'Y', 'Y', 'Y'], - y=[1, 0, 1, 0, 0])) +df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], + workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], + y=[1, 0, 1, 1, 0, 1, 0])) X = df.drop('y', axis=1) y = df['y'] pipe = Pipeline([ ('cat', OneHotVectorizer() << 'education'), # unnamed step, stays same in grid search OneHotHashVectorizer() << 'workclass', - # this instance of FastTreesBinaryClassifier with num_trees 0 will be + # this instance of FastTreesBinaryClassifier with number_of_trees 0 will be # never run by grid search as its not a part of param_grid below - ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2)) + ('learner', FastTreesBinaryClassifier(number_of_trees=0, number_of_leaves=2)) ]) param_grid = dict( cat__output_kind=[ - 'Ind', 'Bin'], learner__num_trees=[ + 'Indicator', 'Binary'], learner__number_of_trees=[ 1, 2, 3]) grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn') grid.fit(X, y) print(grid.best_params_) -# {'cat__output_kind': 'Ind', 'learner__num_trees': 1} +# {'cat__output_kind': 'Indicator', 'learner__number_of_trees': 1} diff --git a/src/python/nimbusml/examples/PipelineWithGridSearchCV2.py b/src/python/nimbusml/examples/PipelineWithGridSearchCV2.py index 
524f8ddd..8d7fc2d2 100644 --- a/src/python/nimbusml/examples/PipelineWithGridSearchCV2.py +++ b/src/python/nimbusml/examples/PipelineWithGridSearchCV2.py @@ -8,9 +8,9 @@ LogisticRegressionBinaryClassifier from sklearn.model_selection import GridSearchCV -df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'], - workclass=['X', 'X', 'Y', 'Y', 'Y'], - y=[1, 0, 1, 0, 0])) +df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], + workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], + y=[1, 0, 1, 1, 0, 1, 0])) X = df.drop('y', axis=1) y = df['y'] @@ -18,7 +18,7 @@ learner = FastTreesBinaryClassifier() pipe = Pipeline(steps=[('cat', cat), ('learner', learner)]) -param_grid = dict(cat__hash_bits=[1, 2, 4, 6, 8, 16], +param_grid = dict(cat__number_of_bits=[1, 2, 4, 6, 8, 16], learner=[ FastLinearBinaryClassifier(), FastTreesBinaryClassifier(), @@ -30,5 +30,5 @@ grid.fit(X, y) print(grid.best_params_['learner'].__class__.__name__) # FastLinearBinaryClassifier -print(grid.best_params_['cat__hash_bits']) -# 1 +print(grid.best_params_['cat__number_of_bits']) +# 2 diff --git a/src/python/nimbusml/examples/TensorFlowScorer.py b/src/python/nimbusml/examples/TensorFlowScorer.py index ef082471..643d2882 100644 --- a/src/python/nimbusml/examples/TensorFlowScorer.py +++ b/src/python/nimbusml/examples/TensorFlowScorer.py @@ -16,7 +16,7 @@ data.head() # transform usage xf = TensorFlowScorer( - model=os.path.join(os.path.dirname(__file__), 'frozen_saved_model.pb'), + model_location=os.path.join(os.path.dirname(__file__), 'frozen_saved_model.pb'), columns={'c': ['a', 'b']} ) diff --git a/src/python/nimbusml/examples/WordEmbedding.py b/src/python/nimbusml/examples/WordEmbedding.py index 569aca12..1f53c15d 100644 --- a/src/python/nimbusml/examples/WordEmbedding.py +++ b/src/python/nimbusml/examples/WordEmbedding.py @@ -19,7 +19,7 @@ # transform usage pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True, + NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens_column_name='ngram_TransformedText', columns={'ngram': ['SentimentText']}), WordEmbedding(columns='ngram_TransformedText') diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ColumnConcatenator_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ColumnConcatenator_df.py index 61005ee4..1ad44821 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/ColumnConcatenator_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/ColumnConcatenator_df.py @@ -1,6 +1,7 @@ ############################################################################### # ColumnConcatenator import numpy as np +import pandas as pd from nimbusml import Pipeline, Role from nimbusml.datasets import get_dataset from nimbusml.linear_model import LogisticRegressionClassifier @@ -31,7 +32,6 @@ # TODO: fix as_matrix() requirement pipeline.fit(X_train, y_train) -scores = pipeline.predict(X_test) -print(scores) # Evaluate the model -print('Accuracy:', np.mean(y_test == [i for i in scores])) +metrics, scores = pipeline.test(X_test, y_test, output_scores=True) +print(metrics) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/FastLinearClassifier_iris_df.py b/src/python/nimbusml/examples/examples_from_dataframe/FastLinearClassifier_iris_df.py index 63de617d..7ab64614 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/FastLinearClassifier_iris_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/FastLinearClassifier_iris_df.py @@ -1,6 +1,7 @@ 
############################################################################### # FastLinearClassifier import numpy as np +import pandas as pd from nimbusml.datasets import get_dataset from nimbusml.linear_model import FastLinearClassifier from sklearn.model_selection import train_test_split @@ -19,6 +20,7 @@ lr = FastLinearClassifier().fit(X_train, y_train) scores = lr.predict(X_test) +scores = pd.to_numeric(scores) # evaluate the model print('Accuracy:', np.mean(y_test == [i for i in scores])) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/FromKey_df.py b/src/python/nimbusml/examples/examples_from_dataframe/FromKey_df.py index d4a86d54..176b7020 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/FromKey_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/FromKey_df.py @@ -16,5 +16,5 @@ tokey = ToKey(columns='text') y = tokey.fit_transform(categorical_df) -y2 = fromkey.fit_transform(y) +y2 = fromkey.clone().fit_transform(y) print(y2['text'] == categorical_df['text']) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LightGbmClassifier_iris_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LightGbmClassifier_iris_df.py index d0245a2c..f2534479 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/LightGbmClassifier_iris_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/LightGbmClassifier_iris_df.py @@ -1,17 +1,20 @@ ############################################################################### # LightGbmClassifier import numpy as np +import pandas as pd from nimbusml.datasets import get_dataset from nimbusml.ensemble import LightGbmClassifier from sklearn.model_selection import train_test_split +np.random.seed(0) + # use 'iris' data set to create test and train data +df = get_dataset("iris").as_df() +print(df.head()) # Sepal_Length Sepal_Width Petal_Length Petal_Width Label Species Setosa # 0 5.1 3.5 1.4 0.2 0 setosa 1.0 # 1 4.9 3.0 1.4 0.2 0 setosa 1.0 -np.random.seed(0) -df = get_dataset("iris").as_df() df.drop(['Species'], inplace=True, axis=1) X_train, X_test, y_train, y_test = \ @@ -19,6 +22,7 @@ lr = LightGbmClassifier().fit(X_train, y_train) scores = lr.predict(X_test) +scores = pd.to_numeric(scores) # evaluate the model print('Accuracy:', np.mean(y_test == [i for i in scores])) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LogisticRegressionClassifier_iris_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LogisticRegressionClassifier_iris_df.py index 73127743..691e4dd3 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/LogisticRegressionClassifier_iris_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/LogisticRegressionClassifier_iris_df.py @@ -1,6 +1,7 @@ ############################################################################### # LogisticRegressionClassifier import numpy as np +import pandas as pd from nimbusml.datasets import get_dataset from nimbusml.linear_model import LogisticRegressionClassifier from sklearn.model_selection import train_test_split @@ -19,6 +20,7 @@ lr = LogisticRegressionClassifier().fit(X_train, y_train) scores = lr.predict(X_test) +scores = pd.to_numeric(scores) # evaluate the model print('Accuracy:', np.mean(y_test == [i for i in scores])) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py index aa9a65ab..e87b8168 100644 --- 
a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py @@ -81,7 +81,7 @@ X = ngram.fit_transform(X) # view the transformed numerical values and column names -print(X) +# print(X.head()) mymodel = LogisticRegressionBinaryClassifier().fit(X, y) @@ -90,4 +90,4 @@ scores = mymodel.predict(ngram.transform(test_reviews)) # view the scores -print(scores) +# print(scores.head()) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NaiveBayesClassifier_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NaiveBayesClassifier_df.py index d0cff5f3..49b67af4 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/NaiveBayesClassifier_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/NaiveBayesClassifier_df.py @@ -1,6 +1,7 @@ ############################################################################### # NaiveBayesClassifier import numpy as np +import pandas as pd from nimbusml import Pipeline from nimbusml.datasets import get_dataset from nimbusml.feature_extraction.text import NGramFeaturizer @@ -26,10 +27,9 @@ nb = NaiveBayesClassifier(feature=['SentimentText']) ppl = Pipeline([texttransform, nb]) - ppl.fit(X_train, y_train) -scores = ppl.predict(X_test)['PredictedLabel'] - # evaluate the model -print('Accuracy:', np.mean(y_test == [i for i in scores])) +metrics, scores = ppl.test(X_test, y_test, output_scores=True) + +print(metrics) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/OneHotHashVectorizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/OneHotHashVectorizer_df.py index 5df9bd78..606ba878 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/OneHotHashVectorizer_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/OneHotHashVectorizer_df.py @@ -75,12 +75,12 @@ # OneHotHashVectorizer transform: the entire string is treated as a category. # if output column name is same as input column, original input column values -# are replaced. hash_bits=6 will hash into 2^6 -1 dimensions +# are replaced. number_of_bits=6 will hash into 2^6 -1 dimensions y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] -cat = OneHotHashVectorizer(hash_bits=6) << 'review' +cat = OneHotHashVectorizer(number_of_bits=6) << 'review' X = cat.fit_transform(X) # view the transformed numerical values and column names diff --git a/src/python/nimbusml/examples/examples_from_dataframe/PcaTransformer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/PcaTransformer_df.py index c4bd1d8c..0ee52495 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/PcaTransformer_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/PcaTransformer_df.py @@ -31,7 +31,7 @@ for rank in range(len(X), 2, -1): print('Number of dimensions=', rank) pipe = Pipeline([ - ColumnConcatenator() << {'X': X}, # X is VectorType column + ColumnConcatenator() << {'X': X}, # X is VectorDataViewType column PcaTransformer(rank=rank) << 'X', # find principal components of X LightGbmBinaryClassifier() ]) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py index 320eaa6d..9a4eba53 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py @@ -17,10 +17,10 @@ "Never visit again... 
rascals!"])) pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True), + NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens_column_name='review_TransformedText'), WordEmbedding() << 'review_TransformedText' ]) y = pipeline.fit_transform(customer_reviews) # view the review embeddings -print(y) +# print(y.head()) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/__init__.py b/src/python/nimbusml/examples/examples_from_dataframe/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/feature_extraction/categorical/onehothashvectorizer.py b/src/python/nimbusml/feature_extraction/categorical/onehothashvectorizer.py index 501ac7b8..f8da6b5b 100644 --- a/src/python/nimbusml/feature_extraction/categorical/onehothashvectorizer.py +++ b/src/python/nimbusml/feature_extraction/categorical/onehothashvectorizer.py @@ -54,7 +54,7 @@ class OneHotHashVectorizer(core, BaseTransform, TransformerMixin): For more details see `Columns `_. - :param hash_bits: An integer specifying the number of bits to hash into. + :param number_of_bits: An integer specifying the number of bits to hash into. Must be between 1 and 30, inclusive. The default value is 16. :param output_kind: A character string that specifies the kind @@ -86,7 +86,7 @@ class OneHotHashVectorizer(core, BaseTransform, TransformerMixin): :param ordered: ``True`` to include the position of each term in the hash. Otherwise, ``False``. The default value is ``True``. - :param invert_hash: An integer specifying the limit on the number of keys + :param maximum_number_of_inverts: An integer specifying the limit on the number of keys that can be used to generate the slot name. ``0`` means no invert hashing; ``-1`` means no limit. 
While a zero value gives better performance, a non-zero value is needed to get meaningful coefficent @@ -109,11 +109,11 @@ class OneHotHashVectorizer(core, BaseTransform, TransformerMixin): @trace def __init__( self, - hash_bits=16, + number_of_bits=16, output_kind='Bag', random_state=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, columns=None, **params): @@ -122,11 +122,11 @@ def __init__( BaseTransform.__init__(self, **params) core.__init__( self, - hash_bits=hash_bits, + number_of_bits=number_of_bits, output_kind=output_kind, random_state=random_state, ordered=ordered, - invert_hash=invert_hash, + maximum_number_of_inverts=maximum_number_of_inverts, **params) self._columns = columns diff --git a/src/python/nimbusml/feature_extraction/categorical/onehotvectorizer.py b/src/python/nimbusml/feature_extraction/categorical/onehotvectorizer.py index bca0fa5b..9b5ef5b6 100644 --- a/src/python/nimbusml/feature_extraction/categorical/onehotvectorizer.py +++ b/src/python/nimbusml/feature_extraction/categorical/onehotvectorizer.py @@ -115,9 +115,9 @@ class OneHotVectorizer(core, BaseTransform, TransformerMixin): def __init__( self, max_num_terms=1000000, - output_kind='Ind', + output_kind='Indicator', term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=True, columns=None, **params): diff --git a/src/python/nimbusml/feature_extraction/image/pixelextractor.py b/src/python/nimbusml/feature_extraction/image/pixelextractor.py index 89219e4c..3697ad45 100644 --- a/src/python/nimbusml/feature_extraction/image/pixelextractor.py +++ b/src/python/nimbusml/feature_extraction/image/pixelextractor.py @@ -62,7 +62,9 @@ class PixelExtractor(core, BaseTransform, TransformerMixin): :param use_blue: Specifies whether to use blue channel. The default value is ``True``. - :param interleave_argb: Whether to separate each channel or + :param order: Order of colors. + + :param interleave: Whether to separate each channel or interleave in ARGB order. This might be important, for example, if you are training a convolutional neural network, since this would affect the shape of @@ -99,7 +101,8 @@ def __init__( use_red=True, use_green=True, use_blue=True, - interleave_argb=False, + order='ARGB', + interleave=False, convert=True, offset=None, scale=None, @@ -115,7 +118,8 @@ def __init__( use_red=use_red, use_green=use_green, use_blue=use_blue, - interleave_argb=interleave_argb, + order=order, + interleave=interleave, convert=convert, offset=offset, scale=scale, diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py index 8b40e117..9ec1858f 100644 --- a/src/python/nimbusml/feature_extraction/text/extractor/ngram.py +++ b/src/python/nimbusml/feature_extraction/text/extractor/ngram.py @@ -58,12 +58,12 @@ class Ngram(core): :param ngram_length: Ngram length. :param skip_length: Maximum number of tokens to skip when constructing an - ngram. + n-gram. - :param all_lengths: Whether to include all ngram lengths up to NgramLength + :param all_lengths: Whether to include all n-gram lengths up to NgramLength or only NgramLength. - :param max_num_terms: Maximum number of ngrams to store in the dictionary. + :param max_num_terms: Maximum number of n-grams to store in the dictionary. :param weighting: The weighting criteria. 
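The hashing transforms follow the same pattern. A small sketch, mirroring the OneHotHashVectorizer example updated earlier in this change, with the renamed arguments (data invented for illustration):

```python
# Toy data, invented for illustration only.
import pandas as pd
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

df = pd.DataFrame(dict(education=['A', 'B', 'A', 'C', 'B']))
xf = OneHotHashVectorizer(number_of_bits=6,             # was hash_bits
                          maximum_number_of_inverts=-1  # was invert_hash
                          ) << 'education'
print(xf.fit_transform(df))
```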
diff --git a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py index 5a79b890..2f373a31 100644 --- a/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py +++ b/src/python/nimbusml/feature_extraction/text/extractor/ngramhash.py @@ -58,15 +58,15 @@ class NgramHash(core): * *term frequency-inverse document frequency* - the product term frequency and the inverse document frequency. - :param hash_bits: Number of bits to hash into. Must be between 1 and 30, - inclusive. + :param number_of_bits: Number of bits to hash into. Must be between 1 and + 30, inclusive. :param ngram_length: Ngram length. :param skip_length: Maximum number of tokens to skip when constructing an - ngram. + n-gram. - :param all_lengths: Whether to include all ngram lengths up to ngramLength + :param all_lengths: Whether to include all n-gram lengths up to ngramLength or only ngramLength. :param seed: Hashing seed. @@ -74,8 +74,9 @@ class NgramHash(core): :param ordered: Whether the position of each source column should be included in the hash (when there are multiple source columns). - :param invert_hash: Limit the number of keys used to generate the slot name - to this many. 0 means no invert hashing, -1 means no limit. + :param maximum_number_of_inverts: Limit the number of keys used to generate + the slot name to this many. 0 means no invert hashing, -1 means no + limit. :param params: Additional arguments sent to compute engine. @@ -94,23 +95,23 @@ class NgramHash(core): @trace def __init__( self, - hash_bits=16, + number_of_bits=16, ngram_length=1, skip_length=0, all_lengths=True, seed=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, **params): core.__init__( self, - hash_bits=hash_bits, + number_of_bits=number_of_bits, ngram_length=ngram_length, skip_length=skip_length, all_lengths=all_lengths, seed=seed, ordered=ordered, - invert_hash=invert_hash, + maximum_number_of_inverts=maximum_number_of_inverts, **params) def get_params(self, deep=False): diff --git a/src/python/nimbusml/feature_extraction/text/lightlda.py b/src/python/nimbusml/feature_extraction/text/lightlda.py index ec016d5d..271f90c7 100644 --- a/src/python/nimbusml/feature_extraction/text/lightlda.py +++ b/src/python/nimbusml/feature_extraction/text/lightlda.py @@ -47,8 +47,8 @@ class LightLda(core, BaseTransform, TransformerMixin): :param num_topic: The number of topics. - :param train_threads: The number of training threads. Default value depends - on number of logical processors. + :param number_of_threads: The number of training threads. Default value + depends on number of logical processors. :param num_max_doc_token: The threshold of maximum count of tokens per doc. 
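
`NgramHash` picks up the same renames (`number_of_bits`, `maximum_number_of_inverts`). A hedged sketch of passing it as the `word_feature_extractor` of `NGramFeaturizer`, mirroring the pattern used elsewhere in these examples; the review data is made up:

```python
import pandas as pd
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import NgramHash

reviews = pd.DataFrame(dict(review=['I like this movie', 'I really hate this movie']))

featurizer = NGramFeaturizer(
    word_feature_extractor=NgramHash(ngram_length=2,
                                     number_of_bits=10,             # was hash_bits
                                     maximum_number_of_inverts=0))  # was invert_hash

print(Pipeline([featurizer]).fit_transform(reviews).head())
```
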
@@ -95,7 +95,7 @@ class LightLda(core, BaseTransform, TransformerMixin): def __init__( self, num_topic=100, - train_threads=0, + number_of_threads=0, num_max_doc_token=512, alpha_sum=100.0, beta=0.01, @@ -115,7 +115,7 @@ def __init__( core.__init__( self, num_topic=num_topic, - train_threads=train_threads, + number_of_threads=number_of_threads, num_max_doc_token=num_max_doc_token, alpha_sum=alpha_sum, beta=beta, diff --git a/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py index b2413fa0..92a3be2a 100644 --- a/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py +++ b/src/python/nimbusml/feature_extraction/text/ngramfeaturizer.py @@ -100,7 +100,22 @@ class NGramFeaturizer(core, BaseTransform, TransformerMixin): * ``"Spanish"`` * ``"Japanese"``. - :param use_predefined_stop_word_remover: Use stop remover or not. + :param stop_words_remover: Specifies the stopwords remover to use. There + are + three options supported: + + * `None`: No stopwords remover is used. + * :py:class:`PredefinedStopWordsRemover + ` : + A precompiled language-specific lists + of stop words is used that includes the most common words from + Microsoft Office. + * :py:class:`CustomStopWordsRemover + ` : A + user-defined list of stopwords. It accepts + the following option: ``stopword``. + + The default value is `None`. :param text_case: Text casing using the rules of the invariant culture. Takes the @@ -122,8 +137,8 @@ class NGramFeaturizer(core, BaseTransform, TransformerMixin): :param keep_numbers: ``False`` to remove numbers; ``True`` to retain numbers. The default value is ``True``. - :param output_tokens: Whether to output the transformed text tokens as an - additional column. + :param output_tokens_column_name: Column containing the transformed text + tokens. :param dictionary: A dictionary of whitelisted terms which accepts the following options: @@ -203,12 +218,12 @@ class NGramFeaturizer(core, BaseTransform, TransformerMixin): def __init__( self, language='English', - use_predefined_stop_word_remover=False, + stop_words_remover=None, text_case='Lower', keep_diacritics=False, keep_punctuations=True, keep_numbers=True, - output_tokens=False, + output_tokens_column_name=None, dictionary=None, word_feature_extractor=Ngram( max_num_terms=[10000000]), @@ -226,12 +241,12 @@ def __init__( core.__init__( self, language=language, - use_predefined_stop_word_remover=use_predefined_stop_word_remover, + stop_words_remover=stop_words_remover, text_case=text_case, keep_diacritics=keep_diacritics, keep_punctuations=keep_punctuations, keep_numbers=keep_numbers, - output_tokens=output_tokens, + output_tokens_column_name=output_tokens_column_name, dictionary=dictionary, word_feature_extractor=word_feature_extractor, char_feature_extractor=char_feature_extractor, diff --git a/src/python/nimbusml/feature_extraction/text/wordembedding.py b/src/python/nimbusml/feature_extraction/text/wordembedding.py index 452c735e..ad467ce1 100644 --- a/src/python/nimbusml/feature_extraction/text/wordembedding.py +++ b/src/python/nimbusml/feature_extraction/text/wordembedding.py @@ -58,7 +58,7 @@ class WordEmbedding(core, BaseTransform, TransformerMixin): Available options are: 'GloVe50D', 'GloVe100D', 'GloVe200D', 'GloVe300D', 'GloVeTwitter25D', 'GloVeTwitter50D', 'GloVeTwitter100D', 'GloVeTwitter200D', 'FastTextWikipedia300D', - 'Sswe'. + 'SentimentSpecificWordEmbedding'. :param custom_lookup_table: Filename for custom word embedding model. 
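
Putting the text changes together: the boolean `use_predefined_stop_word_remover` is replaced by a `stop_words_remover` object, `output_tokens` by `output_tokens_column_name`, and the `'Sswe'` embedding is now `'SentimentSpecificWordEmbedding'`. A hedged sketch follows; the data is synthetic and the import path of `PredefinedStopWordsRemover` is an assumption based on the docstring reference, not something shown in this diff.

```python
import pandas as pd
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.stopwords import PredefinedStopWordsRemover  # assumed path

reviews = pd.DataFrame(dict(review=['I like this movie', 'I do not like this movie']))

pipeline = Pipeline([
    NGramFeaturizer(stop_words_remover=PredefinedStopWordsRemover(),      # was use_predefined_stop_word_remover=True
                    output_tokens_column_name='review_TransformedText'),  # was output_tokens=True
    WordEmbedding(model_kind='SentimentSpecificWordEmbedding')            # was 'Sswe'
        << 'review_TransformedText'
])
print(pipeline.fit_transform(reviews).head())
```
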
@@ -70,10 +70,9 @@ class WordEmbedding(core, BaseTransform, TransformerMixin): <'This', 'is', 'good'>, users need to create an input column by: * concatenating columns with TX type, - * or using the ``output_tokens=True`` for ``NGramFeaturizer()`` to + * or using the ``output_tokens_column_name`` for ``NGramFeaturizer()`` to convert a column with sentences like "This is good" into <'This', - 'is', 'good'>. The column for the output token column is renamed with - a prefix of '_TranformedText'. + 'is', 'good'>. In the following example, after the ``NGramFeaturizer``, features @@ -105,7 +104,7 @@ class WordEmbedding(core, BaseTransform, TransformerMixin): @trace def __init__( self, - model_kind='Sswe', + model_kind='SentimentSpecificWordEmbedding', custom_lookup_table=None, columns=None, **params): diff --git a/src/python/nimbusml/feature_selection/mutualinformationselector.py b/src/python/nimbusml/feature_selection/mutualinformationselector.py index a8837293..cbd066e7 100644 --- a/src/python/nimbusml/feature_selection/mutualinformationselector.py +++ b/src/python/nimbusml/feature_selection/mutualinformationselector.py @@ -111,11 +111,11 @@ def __init__( columns=None, **params): - if 'label_column' in params: + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label if columns: params['columns'] = columns BaseTransform.__init__(self, **params) diff --git a/src/python/nimbusml/internal/core/base_pipeline_item.py b/src/python/nimbusml/internal/core/base_pipeline_item.py index c29724e6..b2daf9ad 100644 --- a/src/python/nimbusml/internal/core/base_pipeline_item.py +++ b/src/python/nimbusml/internal/core/base_pipeline_item.py @@ -248,7 +248,7 @@ class BasePipelineItem(): def __init__(self, type=None, random_state=None, **params): # The consctuctor is usually called twice. # First time from BaseSomething like BaseTransform. - # Second from interal classes. + # Second from internal classes. if hasattr(self, '_BasePipelineItem_already_called'): return self._BasePipelineItem_already_called = True @@ -485,7 +485,7 @@ def _check_roles(self): # current code makes it difficult to guess. # A minor modification in entrypoints.py should do the # trick. - if self.type != "clusterer": + if self.type not in {"clusterer", "anomaly"} : warnings.warn( "Model '{0}' (type='{1}') does not support " "role '{2}' (for developers, check " @@ -771,23 +771,23 @@ def set_inputs(self, inp, early=False): # Needed for learner. % is also used to define feature roles. if self.type in {'classifier', 'regressor', 'ranker', 'clustering', 'anomaly'}: - self.feature_column = getattr(self, attr) - if not isinstance(self.feature_column, (str, tuple)): - if isinstance(self.feature_column, list): - if len(self.feature_column) == 1: - self.feature_column = self.feature_column[0] + self.feature_column_name = getattr(self, attr) + if not isinstance(self.feature_column_name, (str, tuple)): + if isinstance(self.feature_column_name, list): + if len(self.feature_column_name) == 1: + self.feature_column_name = self.feature_column_name[0] else: # Experiment will merge them. # raise RuntimeError("Too many feature columns. 
# Use ConcatTransform to merge them: " # " ConcatTransform() % {0} > - # Role.Feature".format(self.feature_column)) + # Role.Feature".format(self.feature_column_name)) pass else: raise TypeError( "Feature column type is unexpected: {0}".format( type( - self.feature_column))) + self.feature_column_name))) self._attr_input = attr self._check_inputs() diff --git a/src/python/nimbusml/internal/core/cluster/kmeansplusplus.py b/src/python/nimbusml/internal/core/cluster/kmeansplusplus.py index b3e8f8fa..f7e34820 100644 --- a/src/python/nimbusml/internal/core/cluster/kmeansplusplus.py +++ b/src/python/nimbusml/internal/core/cluster/kmeansplusplus.py @@ -61,19 +61,19 @@ class KMeansPlusPlus(BasePipelineItem, DefaultSignatureWithRoles): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param n_clusters: The number of clusters. - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. - :param init_algorithm: Cluster initialization algorithm. + :param initialization_algorithm: Cluster initialization algorithm. :param opt_tol: Tolerance parameter for trainer convergence. Low = slower, more accurate. - :param max_iterations: Maximum number of iterations. + :param maximum_number_of_iterations: Maximum number of iterations. :param accel_mem_budget_mb: Memory budget (in MBs) to use for KMeans acceleration. @@ -99,10 +99,10 @@ def __init__( normalize='Auto', caching='Auto', n_clusters=5, - train_threads=None, - init_algorithm='KMeansParallel', + number_of_threads=None, + initialization_algorithm='KMeansYinyang', opt_tol=1e-07, - max_iterations=1000, + maximum_number_of_iterations=1000, accel_mem_budget_mb=4096, **params): BasePipelineItem.__init__( @@ -111,10 +111,10 @@ def __init__( self.normalize = normalize self.caching = caching self.n_clusters = n_clusters - self.train_threads = train_threads - self.init_algorithm = init_algorithm + self.number_of_threads = number_of_threads + self.initialization_algorithm = initialization_algorithm self.opt_tol = opt_tol - self.max_iterations = max_iterations + self.maximum_number_of_iterations = maximum_number_of_iterations self.accel_mem_budget_mb = accel_mem_budget_mb @property @@ -124,19 +124,19 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - weight_column=self._getattr_role( - 'weight_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), normalize_features=self.normalize, caching=self.caching, k=self.n_clusters, - num_threads=self.train_threads, - init_algorithm=self.init_algorithm, + number_of_threads=self.number_of_threads, + initialization_algorithm=self.initialization_algorithm, opt_tol=self.opt_tol, - max_iterations=self.max_iterations, + maximum_number_of_iterations=self.maximum_number_of_iterations, accel_mem_budget_mb=self.accel_mem_budget_mb) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py index f0a7b9a5..c54f353b 100644 --- 
a/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py +++ b/src/python/nimbusml/internal/core/decomposition/factorizationmachinebinaryclassifier.py @@ -48,42 +48,30 @@ class FactorizationMachineBinaryClassifier( `_ - :param learning_rate: Initial learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. - :param iters: Number of training iterations. + :param number_of_iterations: Number of training iterations. - :param latent_dim: Latent space dimension. + :param latent_dimension: Latent space dimension. :param lambda_linear: Regularization coefficient of linear weights. :param lambda_latent: Regularization coefficient of latent weights. - :param normalize: Specifies the type of automatic normalization used: - - * ``"Auto"``: if normalization is needed, it is performed - automatically. This is the default choice. - * ``"No"``: no normalization is performed. - * ``"Yes"``: normalization is performed. - * ``"Warn"``: if normalization is needed, a warning - message is displayed, but normalization is not performed. - - Normalization rescales disparate data ranges to a standard scale. - Feature - scaling insures the distances between data points are proportional - and - enables various optimization methods such as gradient descent to - converge - much faster. If normalization is performed, a ``MaxMin`` normalizer - is - used. It normalizes values in an interval [a, b] where ``-1 <= a <= - 0`` - and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves - sparsity by mapping zero to zero. - - :param norm: Whether to normalize the input vectors so that the + :param normalize: Whether to normalize the input vectors so that the concatenation of all fields' feature vectors is unit-length. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. + + :param extra_feature_columns: Extra columns to use for feature vectors. The + i-th specified string denotes the column containing features form the + (i+1)-th field. Note that the first field is specified by "feat" + instead of "exfeat". :param shuffle: Whether to shuffle for each training iteration. 
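
A sketch of the renamed factorization-machine options on synthetic data: `iters` and `latent_dim` become `number_of_iterations` and `latent_dimension`, and the old `normalize`/`norm` pair collapses into a single boolean `normalize`. Import path assumed from the package layout; the data below is invented.

```python
import numpy as np
import pandas as pd
from nimbusml.decomposition import FactorizationMachineBinaryClassifier

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(100, 4), columns=['f0', 'f1', 'f2', 'f3'])
y = (X['f0'] + X['f1'] > 1.0).astype(int)

fm = FactorizationMachineBinaryClassifier(
    number_of_iterations=10,   # was iters
    latent_dimension=8,        # was latent_dim
    normalize=True)            # replaces the old norm flag
fm.fit(X, y)
print(fm.predict(X).head())
```
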
@@ -113,13 +101,13 @@ class FactorizationMachineBinaryClassifier( def __init__( self, learning_rate=0.1, - iters=5, - latent_dim=20, + number_of_iterations=5, + latent_dimension=20, lambda_linear=0.0001, lambda_latent=0.0001, - normalize='Auto', - norm=True, + normalize=True, caching='Auto', + extra_feature_columns=None, shuffle=True, verbose=True, radius=0.5, @@ -128,13 +116,13 @@ def __init__( self, type='classifier', **params) self.learning_rate = learning_rate - self.iters = iters - self.latent_dim = latent_dim + self.number_of_iterations = number_of_iterations + self.latent_dimension = latent_dimension self.lambda_linear = lambda_linear self.lambda_latent = lambda_latent self.normalize = normalize - self.norm = norm self.caching = caching + self.extra_feature_columns = extra_feature_columns self.shuffle = shuffle self.verbose = verbose self.radius = radius @@ -146,20 +134,23 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), learning_rate=self.learning_rate, - iters=self.iters, - latent_dim=self.latent_dim, + number_of_iterations=self.number_of_iterations, + latent_dimension=self.latent_dimension, lambda_linear=self.lambda_linear, lambda_latent=self.lambda_latent, normalize_features=self.normalize, - norm=self.norm, caching=self.caching, + extra_feature_columns=self.extra_feature_columns, shuffle=self.shuffle, verbose=self.verbose, radius=self.radius) diff --git a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py index 08da4e08..728a7132 100644 --- a/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py +++ b/src/python/nimbusml/internal/core/decomposition/pcaanomalydetector.py @@ -88,7 +88,7 @@ class PcaAnomalyDetector( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param rank: The number of components in the PCA. 
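
For the clustering side, `train_threads`, `init_algorithm` and `max_iterations` become `number_of_threads`, `initialization_algorithm` and `maximum_number_of_iterations`; note the default initialization algorithm also changes from `'KMeansParallel'` to `'KMeansYinyang'`. A hedged sketch on unlabeled synthetic data (import path assumed from the package layout):

```python
import numpy as np
import pandas as pd
from nimbusml.cluster import KMeansPlusPlus

X = pd.DataFrame(np.random.RandomState(1).rand(60, 3), columns=['x1', 'x2', 'x3'])

kmeans = KMeansPlusPlus(n_clusters=3,
                        number_of_threads=1,                       # was train_threads
                        initialization_algorithm='KMeansYinyang',  # was init_algorithm
                        maximum_number_of_iterations=100)          # was max_iterations
kmeans.fit(X)
print(kmeans.predict(X).head())
```
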
@@ -137,11 +137,11 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - weight_column=self._getattr_role( - 'weight_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), normalize_features=self.normalize, caching=self.caching, diff --git a/src/python/nimbusml/internal/core/decomposition/pcatransformer.py b/src/python/nimbusml/internal/core/decomposition/pcatransformer.py index aaf4d060..f013429f 100644 --- a/src/python/nimbusml/internal/core/decomposition/pcatransformer.py +++ b/src/python/nimbusml/internal/core/decomposition/pcatransformer.py @@ -139,8 +139,8 @@ def _get_node(self, **all_args): o in zip( input_columns, output_columns)] if input_columns else None, - weight_column=self._getattr_role( - 'weight_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), rank=self.rank, oversampling=self.oversampling, diff --git a/src/python/nimbusml/internal/core/ensemble/booster/dart.py b/src/python/nimbusml/internal/core/ensemble/booster/dart.py index 8607e252..dd4418d3 100644 --- a/src/python/nimbusml/internal/core/ensemble/booster/dart.py +++ b/src/python/nimbusml/internal/core/ensemble/booster/dart.py @@ -36,53 +36,51 @@ class Dart(Component): `_ - :param drop_rate: Drop ratio for trees. Range:(0,1). + :param tree_drop_fraction: The drop ratio for trees. Range:(0,1). - :param max_drop: Max number of dropped tree in a boosting round. + :param maximum_number_of_dropped_trees_per_round: Maximum number of dropped + trees in a boosting round. - :param skip_drop: Probability for not perform dropping in a boosting round. + :param skip_drop_fraction: Probability for not dropping in a boosting + round. :param xgboost_dart_mode: True will enable xgboost dart mode. :param uniform_drop: True will enable uniform drop. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. 
+ :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. @@ -105,61 +103,54 @@ class Dart(Component): @trace def __init__( self, - drop_rate=0.1, - max_drop=1, - skip_drop=0.5, + tree_drop_fraction=0.1, + maximum_number_of_dropped_trees_per_round=1, + skip_drop_fraction=0.5, xgboost_dart_mode=False, uniform_drop=False, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): - self.drop_rate = drop_rate - self.max_drop = max_drop - self.skip_drop = skip_drop + self.tree_drop_fraction = tree_drop_fraction + self.maximum_number_of_dropped_trees_per_round = maximum_number_of_dropped_trees_per_round + self.skip_drop_fraction = skip_drop_fraction self.xgboost_dart_mode = xgboost_dart_mode self.uniform_drop = uniform_drop - self.unbalanced_sets = unbalanced_sets - self.min_split_gain = min_split_gain - self.max_depth = max_depth - self.min_child_weight = min_child_weight - self.subsample_freq = subsample_freq - self.subsample = subsample + self.minimum_split_gain = minimum_split_gain + self.maximum_tree_depth = maximum_tree_depth + self.minimum_child_weight = minimum_child_weight + self.subsample_frequency = subsample_frequency + self.subsample_fraction = subsample_fraction self.feature_fraction = feature_fraction - self.reg_lambda = reg_lambda - self.reg_alpha = reg_alpha - self.scale_pos_weight = scale_pos_weight + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization self.kind = 'BoosterParameterFunction' self.name = 'dart' self.settings = {} - if drop_rate is not None: - self.settings['DropRate'] = try_set( - obj=drop_rate, + if tree_drop_fraction is not None: + self.settings['TreeDropFraction'] = try_set( + obj=tree_drop_fraction, none_acceptable=True, 
is_of_type=numbers.Real, valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if max_drop is not None: - self.settings['MaxDrop'] = try_set( - obj=max_drop, + if maximum_number_of_dropped_trees_per_round is not None: + self.settings['MaximumNumberOfDroppedTreesPerRound'] = try_set( + obj=maximum_number_of_dropped_trees_per_round, none_acceptable=True, - is_of_type=numbers.Real, - valid_range={ - 'Inf': 0, - 'Max': 2147483647}) - if skip_drop is not None: - self.settings['SkipDrop'] = try_set( - obj=skip_drop, + is_of_type=numbers.Real, valid_range={'Inf': 0, 'Max': 2147483647}) + if skip_drop_fraction is not None: + self.settings['SkipDropFraction'] = try_set( + obj=skip_drop_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -171,38 +162,35 @@ def __init__( if uniform_drop is not None: self.settings['UniformDrop'] = try_set( obj=uniform_drop, none_acceptable=True, is_of_type=bool) - if unbalanced_sets is not None: - self.settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - self.settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + self.settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - self.settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + self.settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - self.settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + self.settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - self.settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + self.settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - self.settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + self.settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -216,21 +204,16 @@ def __init__( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - self.settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + self.settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - self.settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + self.settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if scale_pos_weight is not None: - self.settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - is_of_type=numbers.Real) super( Dart, diff --git a/src/python/nimbusml/internal/core/ensemble/booster/gbdt.py b/src/python/nimbusml/internal/core/ensemble/booster/gbdt.py index 4a42bc82..e165d465 100644 --- a/src/python/nimbusml/internal/core/ensemble/booster/gbdt.py +++ b/src/python/nimbusml/internal/core/ensemble/booster/gbdt.py @@ -20,43 +20,39 
@@ class Gbdt(Component): Traditional Gradient Boosting Decision Tree. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. + :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. 
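
The LightGBM booster components now expose the spelled-out names and drop `unbalanced_sets`/`scale_pos_weight`. A minimal sketch of constructing the renamed `Dart` booster; the `settings` dictionary shown is the one populated in the `__init__` above, so the printed keys are the new ML.NET option names (e.g. `'TreeDropFraction'`, `'MaximumNumberOfDroppedTreesPerRound'`).

```python
from nimbusml.ensemble.booster import Dart

booster = Dart(tree_drop_fraction=0.2,                       # was drop_rate
               maximum_number_of_dropped_trees_per_round=2,  # was max_drop
               skip_drop_fraction=0.5,                       # was skip_drop
               maximum_tree_depth=6,                         # was max_depth
               l2_regularization=0.01,                       # was reg_lambda
               l1_regularization=0.0)                        # was reg_alpha

# Options serialize under the new ML.NET names.
print(sorted(booster.settings.keys()))
```
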
@@ -79,64 +75,57 @@ class Gbdt(Component): @trace def __init__( self, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): - self.unbalanced_sets = unbalanced_sets - self.min_split_gain = min_split_gain - self.max_depth = max_depth - self.min_child_weight = min_child_weight - self.subsample_freq = subsample_freq - self.subsample = subsample + self.minimum_split_gain = minimum_split_gain + self.maximum_tree_depth = maximum_tree_depth + self.minimum_child_weight = minimum_child_weight + self.subsample_frequency = subsample_frequency + self.subsample_fraction = subsample_fraction self.feature_fraction = feature_fraction - self.reg_lambda = reg_lambda - self.reg_alpha = reg_alpha - self.scale_pos_weight = scale_pos_weight + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization self.kind = 'BoosterParameterFunction' self.name = 'gbdt' self.settings = {} - if unbalanced_sets is not None: - self.settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - self.settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + self.settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - self.settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + self.settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - self.settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + self.settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - self.settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + self.settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - self.settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + self.settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -150,21 +139,16 @@ def __init__( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - self.settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + self.settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - self.settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + self.settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if scale_pos_weight is not None: - self.settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - 
is_of_type=numbers.Real) super( Gbdt, diff --git a/src/python/nimbusml/internal/core/ensemble/booster/goss.py b/src/python/nimbusml/internal/core/ensemble/booster/goss.py index deb02c33..694cb8bf 100644 --- a/src/python/nimbusml/internal/core/ensemble/booster/goss.py +++ b/src/python/nimbusml/internal/core/ensemble/booster/goss.py @@ -41,43 +41,39 @@ class Goss(Component): :param other_rate: Retain ratio for small gradient instances. - :param unbalanced_sets: Use for binary classification when classes are not - balanced. + :param minimum_split_gain: Minimum loss reduction required to make a + further partition on a leaf node of the tree. the larger, the more + conservative the algorithm will be. - :param min_split_gain: Minimum loss reduction required to make a further - partition on a leaf node of the tree. the larger, the more conservative - the algorithm will be. + :param maximum_tree_depth: Maximum depth of a tree. 0 means no limit. + However, tree still grows by best-first. - :param max_depth: Maximum depth of a tree. 0 means no limit. However, tree - still grows by best-first. - - :param min_child_weight: Minimum sum of instance weight(hessian) needed in - a child. If the tree partition step results in a leaf node with the sum - of instance weight less than min_child_weight, then the building + :param minimum_child_weight: Minimum sum of instance weight(hessian) needed + in a child. If the tree partition step results in a leaf node with the + sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. - :param subsample_freq: Subsample frequency. 0 means no subsample. If - subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And - the subset will be updated on every Subsample iteratinos. + :param subsample_frequency: Subsample frequency for bagging. 0 means no + subsample. Specifies the frequency at which the bagging occurs, where + if this is set to N, the subsampling will happen at every N + iterations.This must be set with Subsample as this specifies the amount + to subsample. - :param subsample: Subsample ratio of the training instance. Setting it to - 0.5 means that LightGBM randomly collected half of the data instances - to grow trees and this will prevent overfitting. Range: (0,1]. + :param subsample_fraction: Subsample ratio of the training instance. + Setting it to 0.5 means that LightGBM randomly collected half of the + data instances to grow trees and this will prevent overfitting. Range: + (0,1]. :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. - :param reg_lambda: L2 regularization term on weights, increasing this value - will make model more conservative. - - :param reg_alpha: L1 regularization term on weights, increase this value - will make model more conservative. + :param l2_regularization: L2 regularization term on weights, increasing + this value will make model more conservative. - :param scale_pos_weight: Control the balance of positive and negative - weights, useful for unbalanced classes. A typical value to consider: - sum(negative cases) / sum(positive cases). + :param l1_regularization: L1 regularization term on weights, increase this + value will make model more conservative. :param params: Additional arguments sent to compute engine. 
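
A hedged sketch of handing a renamed booster to a LightGBM learner. The learner itself is not part of this diff, so `LightGbmBinaryClassifier` and its `booster=` argument are assumptions based on the existing nimbusml API; the data is synthetic.

```python
import numpy as np
import pandas as pd
from nimbusml.ensemble import LightGbmBinaryClassifier  # not changed in this diff; assumed API
from nimbusml.ensemble.booster import Goss

rng = np.random.RandomState(2)
X = pd.DataFrame(rng.rand(200, 5), columns=['f0', 'f1', 'f2', 'f3', 'f4'])
y = (X['f0'] + X['f1'] > 1.0).astype(int)

clf = LightGbmBinaryClassifier(
    booster=Goss(maximum_tree_depth=6,      # was max_depth
                 minimum_split_gain=0.0,    # was min_split_gain
                 subsample_fraction=0.8,    # was subsample
                 l2_regularization=0.01))   # was reg_lambda
clf.fit(X, y)
print(clf.predict_proba(X)[:3])
```
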
@@ -102,30 +98,26 @@ def __init__( self, top_rate=0.2, other_rate=0.1, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): self.top_rate = top_rate self.other_rate = other_rate - self.unbalanced_sets = unbalanced_sets - self.min_split_gain = min_split_gain - self.max_depth = max_depth - self.min_child_weight = min_child_weight - self.subsample_freq = subsample_freq - self.subsample = subsample + self.minimum_split_gain = minimum_split_gain + self.maximum_tree_depth = maximum_tree_depth + self.minimum_child_weight = minimum_child_weight + self.subsample_frequency = subsample_frequency + self.subsample_fraction = subsample_fraction self.feature_fraction = feature_fraction - self.reg_lambda = reg_lambda - self.reg_alpha = reg_alpha - self.scale_pos_weight = scale_pos_weight + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization self.kind = 'BoosterParameterFunction' self.name = 'goss' self.settings = {} @@ -146,38 +138,35 @@ def __init__( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if unbalanced_sets is not None: - self.settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - self.settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + self.settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - self.settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + self.settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - self.settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + self.settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - self.settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + self.settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - self.settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + self.settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -191,21 +180,16 @@ def __init__( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - self.settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + self.settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - self.settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + self.settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if 
scale_pos_weight is not None: - self.settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - is_of_type=numbers.Real) super( Goss, diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py index 3f351ef2..270584a3 100644 --- a/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/fastforestbinaryclassifier.py @@ -64,19 +64,20 @@ class FastForestBinaryClassifier( stumps-to-trees-to-forests/>`_ - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param normalize: If ``Auto``, the choice to normalize depends on the preference declared by the algorithm. This is the default choice. If @@ -86,22 +87,22 @@ class FastForestBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_tree_output: Upper bound on absolute value of single tree - output. + :param maximum_output_magnitude_per_tree: Upper bound on absolute value of + single tree output. - :param quantile_sample_count: Number of labels to be sampled from each leaf - to make the distribtuion. + :param number_of_quantile_samples: Number of labels to be sampled from each + leaf to make the distribution. :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. 
@@ -119,19 +120,19 @@ class FastForestBinaryClassifier( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -140,7 +141,8 @@ class FastForestBinaryClassifier( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -159,17 +161,18 @@ class FastForestBinaryClassifier( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -180,9 +183,6 @@ class FastForestBinaryClassifier( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
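
A sketch of the renamed fast-forest options on synthetic data: `num_trees`, `num_leaves`, `min_split` and `train_threads` become `number_of_trees`, `number_of_leaves`, `minimum_example_count_per_leaf` and `number_of_threads` (import path assumed from the package layout; data invented):

```python
import numpy as np
import pandas as pd
from nimbusml.ensemble import FastForestBinaryClassifier

rng = np.random.RandomState(3)
X = pd.DataFrame(rng.rand(150, 4), columns=['a', 'b', 'c', 'd'])
y = (X['a'] > 0.5).astype(int)

forest = FastForestBinaryClassifier(
    number_of_trees=50,                # was num_trees
    number_of_leaves=16,               # was num_leaves
    minimum_example_count_per_leaf=5,  # was min_split
    number_of_threads=1)               # was train_threads
forest.fit(X, y)
print(forest.predict(X).head())
```
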
@@ -204,87 +204,85 @@ class FastForestBinaryClassifier( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, normalize='Auto', caching='Auto', - max_tree_output=100.0, - quantile_sample_count=100, + maximum_output_magnitude_per_tree=100.0, + number_of_quantile_samples=100, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - example_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.num_trees = num_trees - self.num_leaves = num_leaves - self.min_split = min_split + self.number_of_trees = number_of_trees + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.normalize = normalize self.caching = caching - self.max_tree_output = max_tree_output - self.quantile_sample_count = quantile_sample_count + self.maximum_output_magnitude_per_tree = maximum_output_magnitude_per_tree + self.number_of_quantile_samples = number_of_quantile_samples self.parallel_trainer = parallel_trainer - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.random_state = random_state - self.feature_select_seed = feature_select_seed + self.feature_selection_seed = feature_selection_seed self.entropy_coefficient = entropy_coefficient self.histogram_pool_size = histogram_pool_size self.disk_transpose = disk_transpose self.feature_flocks = feature_flocks self.categorical_split = categorical_split - self.max_categorical_groups_per_node = max_categorical_groups_per_node - self.max_categorical_split_points = max_categorical_split_points - self.min_docs_percentage_split = min_docs_percentage_split - self.min_docs_for_categorical_split = min_docs_for_categorical_split + self.maximum_categorical_group_count_per_node = maximum_categorical_group_count_per_node + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.minimum_example_fraction_for_categorical_split = minimum_example_fraction_for_categorical_split + self.minimum_examples_for_categorical_split = minimum_examples_for_categorical_split self.bias = bias self.bundling = bundling - self.num_bins = num_bins + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature self.sparsify_threshold = sparsify_threshold self.first_use_penalty = first_use_penalty self.feature_reuse_penalty = feature_reuse_penalty self.gain_conf_level = 
gain_conf_level self.softmax_temperature = softmax_temperature - self.execution_times = execution_times + self.execution_time = execution_time self.feature_fraction = feature_fraction self.bagging_size = bagging_size - self.example_fraction = example_fraction - self.split_fraction = split_fraction + self.bagging_example_fraction = bagging_example_fraction + self.feature_fraction_per_split = feature_fraction_per_split self.smoothing = smoothing self.allow_empty_trees = allow_empty_trees self.feature_compression_level = feature_compression_level self.compress_ensemble = compress_ensemble - self.max_trees_after_compression = max_trees_after_compression self.test_frequency = test_frequency @property @@ -294,48 +292,47 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_trees=self.num_trees, - num_leaves=self.num_leaves, - min_documents_in_leafs=self.min_split, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_trees=self.number_of_trees, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, normalize_features=self.normalize, caching=self.caching, - max_tree_output=self.max_tree_output, - quantile_sample_count=self.quantile_sample_count, + maximum_output_magnitude_per_tree=self.maximum_output_magnitude_per_tree, + number_of_quantile_samples=self.number_of_quantile_samples, parallel_trainer=self.parallel_trainer, - num_threads=self.train_threads, - rng_seed=self.random_state, - feature_select_seed=self.feature_select_seed, + number_of_threads=self.number_of_threads, + seed=self.random_state, + feature_selection_seed=self.feature_selection_seed, entropy_coefficient=self.entropy_coefficient, histogram_pool_size=self.histogram_pool_size, disk_transpose=self.disk_transpose, feature_flocks=self.feature_flocks, categorical_split=self.categorical_split, - max_categorical_groups_per_node=self.max_categorical_groups_per_node, - max_categorical_split_points=self.max_categorical_split_points, - min_docs_percentage_for_categorical_split=self.min_docs_percentage_split, - min_docs_for_categorical_split=self.min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=self.maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=self.minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=self.minimum_examples_for_categorical_split, bias=self.bias, bundling=self.bundling, - max_bins=self.num_bins, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, sparsify_threshold=self.sparsify_threshold, feature_first_use_penalty=self.first_use_penalty, feature_reuse_penalty=self.feature_reuse_penalty, gain_confidence_level=self.gain_conf_level, softmax_temperature=self.softmax_temperature, - execution_times=self.execution_times, + execution_time=self.execution_time, feature_fraction=self.feature_fraction, 
bagging_size=self.bagging_size, - bagging_train_fraction=self.example_fraction, - split_fraction=self.split_fraction, + bagging_example_fraction=self.bagging_example_fraction, + feature_fraction_per_split=self.feature_fraction_per_split, smoothing=self.smoothing, allow_empty_trees=self.allow_empty_trees, feature_compression_level=self.feature_compression_level, compress_ensemble=self.compress_ensemble, - max_trees_after_compression=self.max_trees_after_compression, test_frequency=self.test_frequency) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py index 918a466a..74698a6d 100644 --- a/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fastforestregressor.py @@ -74,19 +74,20 @@ class FastForestRegressor( stumps-to-trees-to-forests/>`_ - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param normalize: If ``Auto``, the choice to normalize depends on the preference declared by the algorithm. This is the default choice. If @@ -96,23 +97,23 @@ class FastForestRegressor( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param shuffle_labels: Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass. - :param quantile_sample_count: Number of labels to be sampled from each leaf - to make the distribtuion. + :param number_of_quantile_samples: Number of labels to be sampled from each + leaf to make the distribution. :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. 
- :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -130,19 +131,19 @@ class FastForestRegressor( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -151,7 +152,8 @@ class FastForestRegressor( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -170,17 +172,18 @@ class FastForestRegressor( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -191,9 +194,6 @@ class FastForestRegressor( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
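Note: the parameter renames above change the public keyword arguments of FastForestRegressor. Below is a minimal migration sketch, assuming the user-facing class in nimbusml.ensemble mirrors the internal core signature shown in this diff; the old keyword names are kept only as comments for comparison.

# Hedged migration sketch for the FastForestRegressor renames above; assumes the
# public wrapper in nimbusml.ensemble exposes the same keyword names as the
# internal core class touched by this diff.
from nimbusml.ensemble import FastForestRegressor

# Keyword names used before this change:
# FastForestRegressor(num_trees=100, num_leaves=20, min_split=10,
#                     train_threads=4, num_bins=255)

# Equivalent call with the renamed keywords:
model = FastForestRegressor(
    number_of_trees=100,
    number_of_leaves=20,
    minimum_example_count_per_leaf=10,
    number_of_threads=4,
    maximum_bin_count_per_feature=255)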
@@ -215,87 +215,85 @@ class FastForestRegressor( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, normalize='Auto', caching='Auto', shuffle_labels=False, - quantile_sample_count=100, + number_of_quantile_samples=100, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - example_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, **params): BasePipelineItem.__init__( self, type='regressor', **params) - self.num_trees = num_trees - self.num_leaves = num_leaves - self.min_split = min_split + self.number_of_trees = number_of_trees + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.normalize = normalize self.caching = caching self.shuffle_labels = shuffle_labels - self.quantile_sample_count = quantile_sample_count + self.number_of_quantile_samples = number_of_quantile_samples self.parallel_trainer = parallel_trainer - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.random_state = random_state - self.feature_select_seed = feature_select_seed + self.feature_selection_seed = feature_selection_seed self.entropy_coefficient = entropy_coefficient self.histogram_pool_size = histogram_pool_size self.disk_transpose = disk_transpose self.feature_flocks = feature_flocks self.categorical_split = categorical_split - self.max_categorical_groups_per_node = max_categorical_groups_per_node - self.max_categorical_split_points = max_categorical_split_points - self.min_docs_percentage_split = min_docs_percentage_split - self.min_docs_for_categorical_split = min_docs_for_categorical_split + self.maximum_categorical_group_count_per_node = maximum_categorical_group_count_per_node + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.minimum_example_fraction_for_categorical_split = minimum_example_fraction_for_categorical_split + self.minimum_examples_for_categorical_split = minimum_examples_for_categorical_split self.bias = bias self.bundling = bundling - self.num_bins = num_bins + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature self.sparsify_threshold = sparsify_threshold self.first_use_penalty = first_use_penalty self.feature_reuse_penalty = feature_reuse_penalty self.gain_conf_level = gain_conf_level self.softmax_temperature = softmax_temperature - self.execution_times = execution_times + self.execution_time = execution_time 
self.feature_fraction = feature_fraction self.bagging_size = bagging_size - self.example_fraction = example_fraction - self.split_fraction = split_fraction + self.bagging_example_fraction = bagging_example_fraction + self.feature_fraction_per_split = feature_fraction_per_split self.smoothing = smoothing self.allow_empty_trees = allow_empty_trees self.feature_compression_level = feature_compression_level self.compress_ensemble = compress_ensemble - self.max_trees_after_compression = max_trees_after_compression self.test_frequency = test_frequency @property @@ -305,48 +303,47 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_trees=self.num_trees, - num_leaves=self.num_leaves, - min_documents_in_leafs=self.min_split, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_trees=self.number_of_trees, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, normalize_features=self.normalize, caching=self.caching, shuffle_labels=self.shuffle_labels, - quantile_sample_count=self.quantile_sample_count, + number_of_quantile_samples=self.number_of_quantile_samples, parallel_trainer=self.parallel_trainer, - num_threads=self.train_threads, - rng_seed=self.random_state, - feature_select_seed=self.feature_select_seed, + number_of_threads=self.number_of_threads, + seed=self.random_state, + feature_selection_seed=self.feature_selection_seed, entropy_coefficient=self.entropy_coefficient, histogram_pool_size=self.histogram_pool_size, disk_transpose=self.disk_transpose, feature_flocks=self.feature_flocks, categorical_split=self.categorical_split, - max_categorical_groups_per_node=self.max_categorical_groups_per_node, - max_categorical_split_points=self.max_categorical_split_points, - min_docs_percentage_for_categorical_split=self.min_docs_percentage_split, - min_docs_for_categorical_split=self.min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=self.maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=self.minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=self.minimum_examples_for_categorical_split, bias=self.bias, bundling=self.bundling, - max_bins=self.num_bins, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, sparsify_threshold=self.sparsify_threshold, feature_first_use_penalty=self.first_use_penalty, feature_reuse_penalty=self.feature_reuse_penalty, gain_confidence_level=self.gain_conf_level, softmax_temperature=self.softmax_temperature, - execution_times=self.execution_times, + execution_time=self.execution_time, feature_fraction=self.feature_fraction, bagging_size=self.bagging_size, - bagging_train_fraction=self.example_fraction, - split_fraction=self.split_fraction, + bagging_example_fraction=self.bagging_example_fraction, + feature_fraction_per_split=self.feature_fraction_per_split, 
smoothing=self.smoothing, allow_empty_trees=self.allow_empty_trees, feature_compression_level=self.feature_compression_level, compress_ensemble=self.compress_ensemble, - max_trees_after_compression=self.max_trees_after_compression, test_frequency=self.test_frequency) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py index f5138708..37e5cd76 100644 --- a/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreesbinaryclassifier.py @@ -80,19 +80,20 @@ class FastTreesBinaryClassifier( `Greedy function approximation: A gradient boosting machine. `_ - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -109,18 +110,19 @@ class FastTreesBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param unbalanced_sets: Should we use derivatives optimized for unbalanced - sets. + :param unbalanced_sets: Option for using derivatives optimized for + unbalanced sets. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -149,7 +151,7 @@ class FastTreesBinaryClassifier( :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. 
- :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -164,17 +166,17 @@ class FastTreesBinaryClassifier( normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -192,19 +194,19 @@ class FastTreesBinaryClassifier( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -213,7 +215,8 @@ class FastTreesBinaryClassifier( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -232,17 +235,18 @@ class FastTreesBinaryClassifier( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. 
+ :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -253,9 +257,6 @@ class FastTreesBinaryClassifier( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. @@ -277,20 +278,20 @@ class FastTreesBinaryClassifier( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', unbalanced_sets=False, best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -299,59 +300,58 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.num_trees = num_trees - self.num_leaves = num_leaves - self.min_split = min_split + self.number_of_trees = number_of_trees + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.learning_rate = learning_rate self.normalize = normalize self.caching = caching self.unbalanced_sets = unbalanced_sets self.best_step_trees = best_step_trees self.use_line_search = use_line_search - self.num_post_bracket_steps = num_post_bracket_steps - self.min_step_size = min_step_size + self.maximum_number_of_line_search_steps = maximum_number_of_line_search_steps + self.minimum_step_size = minimum_step_size self.optimizer = optimizer self.early_stopping_rule = early_stopping_rule self.early_stopping_metrics 
= early_stopping_metrics @@ -363,43 +363,42 @@ def __init__( self.dropout_rate = dropout_rate self.get_derivatives_sample_rate = get_derivatives_sample_rate self.write_last_ensemble = write_last_ensemble - self.max_tree_output = max_tree_output + self.maximum_tree_output = maximum_tree_output self.random_start = random_start self.filter_zero_lambdas = filter_zero_lambdas self.baseline_scores_formula = baseline_scores_formula self.baseline_alpha_risk = baseline_alpha_risk self.position_discount_freeform = position_discount_freeform self.parallel_trainer = parallel_trainer - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.random_state = random_state - self.feature_select_seed = feature_select_seed + self.feature_selection_seed = feature_selection_seed self.entropy_coefficient = entropy_coefficient self.histogram_pool_size = histogram_pool_size self.disk_transpose = disk_transpose self.feature_flocks = feature_flocks self.categorical_split = categorical_split - self.max_categorical_groups_per_node = max_categorical_groups_per_node - self.max_categorical_split_points = max_categorical_split_points - self.min_docs_percentage_split = min_docs_percentage_split - self.min_docs_for_categorical_split = min_docs_for_categorical_split + self.maximum_categorical_group_count_per_node = maximum_categorical_group_count_per_node + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.minimum_example_fraction_for_categorical_split = minimum_example_fraction_for_categorical_split + self.minimum_examples_for_categorical_split = minimum_examples_for_categorical_split self.bias = bias self.bundling = bundling - self.num_bins = num_bins + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature self.sparsify_threshold = sparsify_threshold self.first_use_penalty = first_use_penalty self.feature_reuse_penalty = feature_reuse_penalty self.gain_conf_level = gain_conf_level self.softmax_temperature = softmax_temperature - self.execution_times = execution_times + self.execution_time = execution_time self.feature_fraction = feature_fraction self.bagging_size = bagging_size - self.example_fraction = example_fraction - self.split_fraction = split_fraction + self.bagging_example_fraction = bagging_example_fraction + self.feature_fraction_per_split = feature_fraction_per_split self.smoothing = smoothing self.allow_empty_trees = allow_empty_trees self.feature_compression_level = feature_compression_level self.compress_ensemble = compress_ensemble - self.max_trees_after_compression = max_trees_after_compression self.test_frequency = test_frequency @property @@ -409,21 +408,21 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_trees=self.num_trees, - num_leaves=self.num_leaves, - min_documents_in_leafs=self.min_split, - learning_rates=self.learning_rate, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_trees=self.number_of_trees, + number_of_leaves=self.number_of_leaves, + 
minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, + learning_rate=self.learning_rate, normalize_features=self.normalize, caching=self.caching, unbalanced_sets=self.unbalanced_sets, best_step_ranking_regression_trees=self.best_step_trees, use_line_search=self.use_line_search, - num_post_bracket_steps=self.num_post_bracket_steps, - min_step_size=self.min_step_size, + maximum_number_of_line_search_steps=self.maximum_number_of_line_search_steps, + minimum_step_size=self.minimum_step_size, optimization_algorithm=self.optimizer, early_stopping_rule=self.early_stopping_rule, early_stopping_metrics=self.early_stopping_metrics, @@ -435,43 +434,42 @@ def _get_node(self, **all_args): dropout_rate=self.dropout_rate, get_derivatives_sample_rate=self.get_derivatives_sample_rate, write_last_ensemble=self.write_last_ensemble, - max_tree_output=self.max_tree_output, + maximum_tree_output=self.maximum_tree_output, random_start=self.random_start, filter_zero_lambdas=self.filter_zero_lambdas, baseline_scores_formula=self.baseline_scores_formula, baseline_alpha_risk=self.baseline_alpha_risk, position_discount_freeform=self.position_discount_freeform, parallel_trainer=self.parallel_trainer, - num_threads=self.train_threads, - rng_seed=self.random_state, - feature_select_seed=self.feature_select_seed, + number_of_threads=self.number_of_threads, + seed=self.random_state, + feature_selection_seed=self.feature_selection_seed, entropy_coefficient=self.entropy_coefficient, histogram_pool_size=self.histogram_pool_size, disk_transpose=self.disk_transpose, feature_flocks=self.feature_flocks, categorical_split=self.categorical_split, - max_categorical_groups_per_node=self.max_categorical_groups_per_node, - max_categorical_split_points=self.max_categorical_split_points, - min_docs_percentage_for_categorical_split=self.min_docs_percentage_split, - min_docs_for_categorical_split=self.min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=self.maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=self.minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=self.minimum_examples_for_categorical_split, bias=self.bias, bundling=self.bundling, - max_bins=self.num_bins, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, sparsify_threshold=self.sparsify_threshold, feature_first_use_penalty=self.first_use_penalty, feature_reuse_penalty=self.feature_reuse_penalty, gain_confidence_level=self.gain_conf_level, softmax_temperature=self.softmax_temperature, - execution_times=self.execution_times, + execution_time=self.execution_time, feature_fraction=self.feature_fraction, bagging_size=self.bagging_size, - bagging_train_fraction=self.example_fraction, - split_fraction=self.split_fraction, + bagging_example_fraction=self.bagging_example_fraction, + feature_fraction_per_split=self.feature_fraction_per_split, smoothing=self.smoothing, allow_empty_trees=self.allow_empty_trees, feature_compression_level=self.feature_compression_level, compress_ensemble=self.compress_ensemble, - max_trees_after_compression=self.max_trees_after_compression, test_frequency=self.test_frequency) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py index d041e9b8..3ee724c4 100644 --- 
a/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreesregressor.py @@ -85,19 +85,20 @@ class FastTreesRegressor( `Greedy function approximation: A gradient boosting machine. `_ - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -114,15 +115,16 @@ class FastTreesRegressor( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -151,7 +153,7 @@ class FastTreesRegressor( :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. - :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -166,17 +168,17 @@ class FastTreesRegressor( normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. 
+ :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -194,19 +196,19 @@ class FastTreesRegressor( :param categorical_split: Whether to do split based on multiple categorical feature values. - :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -215,7 +217,8 @@ class FastTreesRegressor( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -234,17 +237,18 @@ class FastTreesRegressor( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -255,9 +259,6 @@ class FastTreesRegressor( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
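The same rename pattern applies to FastTreesRegressor. For code bases carrying configurations under the old keyword names, the following is an illustrative, hypothetical translation helper covering only the renames visible in this diff; note that max_trees_after_compression is removed outright and has no new equivalent, so it should simply be dropped rather than mapped.

# Hypothetical helper sketch: rewrite pre-rename FastTreesRegressor keyword names
# to the new ones introduced by this diff. Not an official compatibility shim.
OLD_TO_NEW = {
    'num_trees': 'number_of_trees',
    'num_leaves': 'number_of_leaves',
    'min_split': 'minimum_example_count_per_leaf',
    'num_post_bracket_steps': 'maximum_number_of_line_search_steps',
    'min_step_size': 'minimum_step_size',
    'max_tree_output': 'maximum_tree_output',
    'train_threads': 'number_of_threads',
    'feature_select_seed': 'feature_selection_seed',
    'num_bins': 'maximum_bin_count_per_feature',
    'execution_times': 'execution_time',
    'example_fraction': 'bagging_example_fraction',
    'split_fraction': 'feature_fraction_per_split',
}

def migrate_kwargs(old_kwargs):
    """Map pre-rename keyword names to their renamed equivalents."""
    return {OLD_TO_NEW.get(name, name): value for name, value in old_kwargs.items()}

# Example: {'num_trees': 200, 'learning_rate': 0.2}
# becomes  {'number_of_trees': 200, 'learning_rate': 0.2}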
@@ -279,16 +280,16 @@ class FastTreesRegressor( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, early_stopping_metrics=1, @@ -300,58 +301,57 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, **params): BasePipelineItem.__init__( self, type='regressor', **params) - self.num_trees = num_trees - self.num_leaves = num_leaves - self.min_split = min_split + self.number_of_trees = number_of_trees + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.learning_rate = learning_rate self.normalize = normalize self.caching = caching self.best_step_trees = best_step_trees self.use_line_search = use_line_search - self.num_post_bracket_steps = num_post_bracket_steps - self.min_step_size = min_step_size + self.maximum_number_of_line_search_steps = maximum_number_of_line_search_steps + self.minimum_step_size = minimum_step_size self.optimizer = optimizer self.early_stopping_rule = early_stopping_rule self.early_stopping_metrics = early_stopping_metrics @@ -363,43 +363,42 @@ def __init__( self.dropout_rate = dropout_rate self.get_derivatives_sample_rate = get_derivatives_sample_rate self.write_last_ensemble = write_last_ensemble - self.max_tree_output = max_tree_output + self.maximum_tree_output = maximum_tree_output self.random_start = random_start self.filter_zero_lambdas = filter_zero_lambdas self.baseline_scores_formula = baseline_scores_formula self.baseline_alpha_risk = baseline_alpha_risk self.position_discount_freeform = position_discount_freeform self.parallel_trainer = parallel_trainer - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.random_state = random_state - self.feature_select_seed = feature_select_seed + self.feature_selection_seed = feature_selection_seed 
self.entropy_coefficient = entropy_coefficient self.histogram_pool_size = histogram_pool_size self.disk_transpose = disk_transpose self.feature_flocks = feature_flocks self.categorical_split = categorical_split - self.max_categorical_groups_per_node = max_categorical_groups_per_node - self.max_categorical_split_points = max_categorical_split_points - self.min_docs_percentage_split = min_docs_percentage_split - self.min_docs_for_categorical_split = min_docs_for_categorical_split + self.maximum_categorical_group_count_per_node = maximum_categorical_group_count_per_node + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.minimum_example_fraction_for_categorical_split = minimum_example_fraction_for_categorical_split + self.minimum_examples_for_categorical_split = minimum_examples_for_categorical_split self.bias = bias self.bundling = bundling - self.num_bins = num_bins + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature self.sparsify_threshold = sparsify_threshold self.first_use_penalty = first_use_penalty self.feature_reuse_penalty = feature_reuse_penalty self.gain_conf_level = gain_conf_level self.softmax_temperature = softmax_temperature - self.execution_times = execution_times + self.execution_time = execution_time self.feature_fraction = feature_fraction self.bagging_size = bagging_size - self.example_fraction = example_fraction - self.split_fraction = split_fraction + self.bagging_example_fraction = bagging_example_fraction + self.feature_fraction_per_split = feature_fraction_per_split self.smoothing = smoothing self.allow_empty_trees = allow_empty_trees self.feature_compression_level = feature_compression_level self.compress_ensemble = compress_ensemble - self.max_trees_after_compression = max_trees_after_compression self.test_frequency = test_frequency @property @@ -409,20 +408,20 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_trees=self.num_trees, - num_leaves=self.num_leaves, - min_documents_in_leafs=self.min_split, - learning_rates=self.learning_rate, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_trees=self.number_of_trees, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, + learning_rate=self.learning_rate, normalize_features=self.normalize, caching=self.caching, best_step_ranking_regression_trees=self.best_step_trees, use_line_search=self.use_line_search, - num_post_bracket_steps=self.num_post_bracket_steps, - min_step_size=self.min_step_size, + maximum_number_of_line_search_steps=self.maximum_number_of_line_search_steps, + minimum_step_size=self.minimum_step_size, optimization_algorithm=self.optimizer, early_stopping_rule=self.early_stopping_rule, early_stopping_metrics=self.early_stopping_metrics, @@ -434,43 +433,42 @@ def _get_node(self, **all_args): dropout_rate=self.dropout_rate, get_derivatives_sample_rate=self.get_derivatives_sample_rate, write_last_ensemble=self.write_last_ensemble, - 
max_tree_output=self.max_tree_output, + maximum_tree_output=self.maximum_tree_output, random_start=self.random_start, filter_zero_lambdas=self.filter_zero_lambdas, baseline_scores_formula=self.baseline_scores_formula, baseline_alpha_risk=self.baseline_alpha_risk, position_discount_freeform=self.position_discount_freeform, parallel_trainer=self.parallel_trainer, - num_threads=self.train_threads, - rng_seed=self.random_state, - feature_select_seed=self.feature_select_seed, + number_of_threads=self.number_of_threads, + seed=self.random_state, + feature_selection_seed=self.feature_selection_seed, entropy_coefficient=self.entropy_coefficient, histogram_pool_size=self.histogram_pool_size, disk_transpose=self.disk_transpose, feature_flocks=self.feature_flocks, categorical_split=self.categorical_split, - max_categorical_groups_per_node=self.max_categorical_groups_per_node, - max_categorical_split_points=self.max_categorical_split_points, - min_docs_percentage_for_categorical_split=self.min_docs_percentage_split, - min_docs_for_categorical_split=self.min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=self.maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=self.minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=self.minimum_examples_for_categorical_split, bias=self.bias, bundling=self.bundling, - max_bins=self.num_bins, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, sparsify_threshold=self.sparsify_threshold, feature_first_use_penalty=self.first_use_penalty, feature_reuse_penalty=self.feature_reuse_penalty, gain_confidence_level=self.gain_conf_level, softmax_temperature=self.softmax_temperature, - execution_times=self.execution_times, + execution_time=self.execution_time, feature_fraction=self.feature_fraction, bagging_size=self.bagging_size, - bagging_train_fraction=self.example_fraction, - split_fraction=self.split_fraction, + bagging_example_fraction=self.bagging_example_fraction, + feature_fraction_per_split=self.feature_fraction_per_split, smoothing=self.smoothing, allow_empty_trees=self.allow_empty_trees, feature_compression_level=self.feature_compression_level, compress_ensemble=self.compress_ensemble, - max_trees_after_compression=self.max_trees_after_compression, test_frequency=self.test_frequency) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py index ccda9375..f9340f5d 100644 --- a/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/fasttreestweedieregressor.py @@ -37,19 +37,20 @@ class FastTreesTweedieRegressor( `Greedy function approximation: A gradient boosting machine. `_ - :param num_trees: Specifies the total number of decision trees to create in - the ensemble. By creating more decision trees, you can potentially get - better coverage, but the training time increases. + :param number_of_trees: Specifies the total number of decision trees to + create in the ensemble. By creating more decision trees, you can + potentially get better coverage, but the training time increases. - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. 
Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. - :param min_split: Minimum number of training instances required to form a - leaf. That is, the minimal number of documents allowed in a leaf of - regression tree, out of the sub-sampled data. A 'split' means that - features in each level of the tree (node) are randomly divided. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -80,19 +81,20 @@ class FastTreesTweedieRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param index: Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. - :param best_step_trees: Use best regression step trees?. + :param best_step_trees: Option for using best regression step trees. :param use_line_search: Should we use line search for a step size. - :param num_post_bracket_steps: Number of post-bracket line search steps. + :param maximum_number_of_line_search_steps: Number of post-bracket line + search steps. - :param min_step_size: Minimum line search step size. + :param minimum_step_size: Minimum line search step size. :param optimizer: Default is ``sgd``. @@ -121,7 +123,7 @@ class FastTreesTweedieRegressor( :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping. - :param max_tree_output: Upper bound on absolute value of single tree + :param maximum_tree_output: Upper bound on absolute value of single tree output. :param random_start: Training starts from random ordering (determined by @@ -136,17 +138,17 @@ class FastTreesTweedieRegressor( normal training). :param position_discount_freeform: The discount freeform which specifies - the per position discounts of documents in a query (uses a single + the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm. - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param random_state: The seed of the random number generator. - :param feature_select_seed: The seed of the active feature selection. + :param feature_selection_seed: The seed of the active feature selection. :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1. @@ -164,19 +166,19 @@ class FastTreesTweedieRegressor( :param categorical_split: Whether to do split based on multiple categorical feature values. 
- :param max_categorical_groups_per_node: Maximum categorical split groups to - consider when splitting on a categorical feature. Split groups are a - collection of split points. This is used to reduce overfitting when - there many categorical features. + :param maximum_categorical_group_count_per_node: Maximum categorical split + groups to consider when splitting on a categorical feature. Split + groups are a collection of split points. This is used to reduce + overfitting when there many categorical features. - :param max_categorical_split_points: Maximum categorical split points to - consider when splitting on a categorical feature. + :param maximum_categorical_split_point_count: Maximum categorical split + points to consider when splitting on a categorical feature. - :param min_docs_percentage_split: Minimum categorical docs percentage in a - bin to consider for a split. + :param minimum_example_fraction_for_categorical_split: Minimum categorical + example percentage in a bin to consider for a split. - :param min_docs_for_categorical_split: Minimum categorical doc count in a - bin to consider for a split. + :param minimum_examples_for_categorical_split: Minimum categorical example + count in a bin to consider for a split. :param bias: Bias for calculating gradient for each feature bin for a categorical feature. @@ -185,7 +187,8 @@ class FastTreesTweedieRegressor( Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. :param sparsify_threshold: Sparsity level needed to use sparse feature representation. @@ -204,17 +207,18 @@ class FastTreesTweedieRegressor( :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature. - :param execution_times: Print execution time breakdown to stdout. + :param execution_time: Print execution time breakdown to stdout. :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration. :param bagging_size: Number of trees in each bag (0 for disabling bagging). - :param example_fraction: Percentage of training examples used in each bag. + :param bagging_example_fraction: Percentage of training examples used in + each bag. - :param split_fraction: The fraction of features (chosen randomly) to use on - each split. + :param feature_fraction_per_split: The fraction of features (chosen + randomly) to use on each split. :param smoothing: Smoothing paramter for tree regularization. @@ -225,9 +229,6 @@ class FastTreesTweedieRegressor( :param compress_ensemble: Compress the tree Ensemble. - :param max_trees_after_compression: Maximum Number of trees after - compression. - :param test_frequency: Calculate metric values for train/valid/test every k rounds. 
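FastTreesTweedieRegressor follows the same renames, and the constructor hunk below also changes the early_stopping_metrics default from 0 to 1. A hedged usage sketch with the new keyword names follows, assuming the public class in nimbusml.ensemble mirrors this signature; the old default is pinned explicitly for callers that relied on it.

# Hedged sketch only; keyword names are taken from the renamed __init__ below.
from nimbusml.ensemble import FastTreesTweedieRegressor

model = FastTreesTweedieRegressor(
    number_of_trees=100,
    number_of_leaves=20,
    minimum_example_count_per_leaf=10,
    index=1.5,                 # Tweedie index in [1, 2]: 1 = Poisson loss, 2 = gamma loss
    early_stopping_metrics=0,  # pin the pre-change default (new default is 1)
    maximum_tree_output=100.0)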
@@ -250,20 +251,20 @@ class FastTreesTweedieRegressor( @trace def __init__( self, - num_trees=100, - num_leaves=20, - min_split=10, + number_of_trees=100, + number_of_leaves=20, + minimum_example_count_per_leaf=10, learning_rate=0.2, normalize='Auto', caching='Auto', index=1.5, best_step_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimizer='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -272,59 +273,58 @@ def __init__( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - train_threads=None, + number_of_threads=None, random_state=123, - feature_select_seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - num_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_conf_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - example_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, test_frequency=2147483647, **params): BasePipelineItem.__init__( self, type='regressor', **params) - self.num_trees = num_trees - self.num_leaves = num_leaves - self.min_split = min_split + self.number_of_trees = number_of_trees + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.learning_rate = learning_rate self.normalize = normalize self.caching = caching self.index = index self.best_step_trees = best_step_trees self.use_line_search = use_line_search - self.num_post_bracket_steps = num_post_bracket_steps - self.min_step_size = min_step_size + self.maximum_number_of_line_search_steps = maximum_number_of_line_search_steps + self.minimum_step_size = minimum_step_size self.optimizer = optimizer self.early_stopping_rule = early_stopping_rule self.early_stopping_metrics = early_stopping_metrics @@ -336,43 +336,42 @@ def __init__( self.dropout_rate = dropout_rate self.get_derivatives_sample_rate = get_derivatives_sample_rate self.write_last_ensemble = write_last_ensemble - self.max_tree_output = max_tree_output + self.maximum_tree_output = maximum_tree_output self.random_start = random_start self.filter_zero_lambdas = filter_zero_lambdas self.baseline_scores_formula = baseline_scores_formula self.baseline_alpha_risk = baseline_alpha_risk self.position_discount_freeform = position_discount_freeform self.parallel_trainer = parallel_trainer - self.train_threads = train_threads + self.number_of_threads = 
number_of_threads self.random_state = random_state - self.feature_select_seed = feature_select_seed + self.feature_selection_seed = feature_selection_seed self.entropy_coefficient = entropy_coefficient self.histogram_pool_size = histogram_pool_size self.disk_transpose = disk_transpose self.feature_flocks = feature_flocks self.categorical_split = categorical_split - self.max_categorical_groups_per_node = max_categorical_groups_per_node - self.max_categorical_split_points = max_categorical_split_points - self.min_docs_percentage_split = min_docs_percentage_split - self.min_docs_for_categorical_split = min_docs_for_categorical_split + self.maximum_categorical_group_count_per_node = maximum_categorical_group_count_per_node + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.minimum_example_fraction_for_categorical_split = minimum_example_fraction_for_categorical_split + self.minimum_examples_for_categorical_split = minimum_examples_for_categorical_split self.bias = bias self.bundling = bundling - self.num_bins = num_bins + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature self.sparsify_threshold = sparsify_threshold self.first_use_penalty = first_use_penalty self.feature_reuse_penalty = feature_reuse_penalty self.gain_conf_level = gain_conf_level self.softmax_temperature = softmax_temperature - self.execution_times = execution_times + self.execution_time = execution_time self.feature_fraction = feature_fraction self.bagging_size = bagging_size - self.example_fraction = example_fraction - self.split_fraction = split_fraction + self.bagging_example_fraction = bagging_example_fraction + self.feature_fraction_per_split = feature_fraction_per_split self.smoothing = smoothing self.allow_empty_trees = allow_empty_trees self.feature_compression_level = feature_compression_level self.compress_ensemble = compress_ensemble - self.max_trees_after_compression = max_trees_after_compression self.test_frequency = test_frequency @property @@ -382,21 +381,21 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_trees=self.num_trees, - num_leaves=self.num_leaves, - min_documents_in_leafs=self.min_split, - learning_rates=self.learning_rate, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_trees=self.number_of_trees, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, + learning_rate=self.learning_rate, normalize_features=self.normalize, caching=self.caching, index=self.index, best_step_ranking_regression_trees=self.best_step_trees, use_line_search=self.use_line_search, - num_post_bracket_steps=self.num_post_bracket_steps, - min_step_size=self.min_step_size, + maximum_number_of_line_search_steps=self.maximum_number_of_line_search_steps, + minimum_step_size=self.minimum_step_size, optimization_algorithm=self.optimizer, early_stopping_rule=self.early_stopping_rule, early_stopping_metrics=self.early_stopping_metrics, @@ -408,43 +407,42 @@ def 
_get_node(self, **all_args): dropout_rate=self.dropout_rate, get_derivatives_sample_rate=self.get_derivatives_sample_rate, write_last_ensemble=self.write_last_ensemble, - max_tree_output=self.max_tree_output, + maximum_tree_output=self.maximum_tree_output, random_start=self.random_start, filter_zero_lambdas=self.filter_zero_lambdas, baseline_scores_formula=self.baseline_scores_formula, baseline_alpha_risk=self.baseline_alpha_risk, position_discount_freeform=self.position_discount_freeform, parallel_trainer=self.parallel_trainer, - num_threads=self.train_threads, - rng_seed=self.random_state, - feature_select_seed=self.feature_select_seed, + number_of_threads=self.number_of_threads, + seed=self.random_state, + feature_selection_seed=self.feature_selection_seed, entropy_coefficient=self.entropy_coefficient, histogram_pool_size=self.histogram_pool_size, disk_transpose=self.disk_transpose, feature_flocks=self.feature_flocks, categorical_split=self.categorical_split, - max_categorical_groups_per_node=self.max_categorical_groups_per_node, - max_categorical_split_points=self.max_categorical_split_points, - min_docs_percentage_for_categorical_split=self.min_docs_percentage_split, - min_docs_for_categorical_split=self.min_docs_for_categorical_split, + maximum_categorical_group_count_per_node=self.maximum_categorical_group_count_per_node, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + minimum_example_fraction_for_categorical_split=self.minimum_example_fraction_for_categorical_split, + minimum_examples_for_categorical_split=self.minimum_examples_for_categorical_split, bias=self.bias, bundling=self.bundling, - max_bins=self.num_bins, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, sparsify_threshold=self.sparsify_threshold, feature_first_use_penalty=self.first_use_penalty, feature_reuse_penalty=self.feature_reuse_penalty, gain_confidence_level=self.gain_conf_level, softmax_temperature=self.softmax_temperature, - execution_times=self.execution_times, + execution_time=self.execution_time, feature_fraction=self.feature_fraction, bagging_size=self.bagging_size, - bagging_train_fraction=self.example_fraction, - split_fraction=self.split_fraction, + bagging_example_fraction=self.bagging_example_fraction, + feature_fraction_per_split=self.feature_fraction_per_split, smoothing=self.smoothing, allow_empty_trees=self.allow_empty_trees, feature_compression_level=self.feature_compression_level, compress_ensemble=self.compress_ensemble, - max_trees_after_compression=self.max_trees_after_compression, test_frequency=self.test_frequency) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py index 1d0eecea..56d90d7e 100644 --- a/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/gambinaryclassifier.py @@ -81,10 +81,13 @@ class GamBinaryClassifier( `_ - :param num_iterations: Total number of iterations over all features. + :param number_of_iterations: Total number of iterations over all features. - :param min_documents: Minimum number of training instances required to form - a partition. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. 
A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -115,7 +118,7 @@ class GamBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param unbalanced_sets: Should we use derivatives optimized for unbalanced sets. @@ -126,15 +129,16 @@ class GamBinaryClassifier( :param gain_conf_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. - :param max_output: Upper bound on absolute value of single output. + :param maximum_tree_output: Upper bound on absolute value of single output. :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function. @@ -165,18 +169,18 @@ class GamBinaryClassifier( @trace def __init__( self, - num_iterations=9500, - min_documents=10, + number_of_iterations=9500, + minimum_example_count_per_leaf=10, learning_rate=0.002, normalize='Auto', caching='Auto', unbalanced_sets=False, entropy_coefficient=0.0, gain_conf_level=0, - train_threads=None, + number_of_threads=None, disk_transpose=None, - num_bins=255, - max_output=float('inf'), + maximum_bin_count_per_feature=255, + maximum_tree_output=float('inf'), get_derivatives_sample_rate=1, random_state=123, feature_flocks=True, @@ -185,18 +189,18 @@ def __init__( BasePipelineItem.__init__( self, type='classifier', **params) - self.num_iterations = num_iterations - self.min_documents = min_documents + self.number_of_iterations = number_of_iterations + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.learning_rate = learning_rate self.normalize = normalize self.caching = caching self.unbalanced_sets = unbalanced_sets self.entropy_coefficient = entropy_coefficient self.gain_conf_level = gain_conf_level - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.disk_transpose = disk_transpose - self.num_bins = num_bins - self.max_output = max_output + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.maximum_tree_output = maximum_tree_output self.get_derivatives_sample_rate = get_derivatives_sample_rate self.random_state = random_state self.feature_flocks = feature_flocks @@ -209,23 +213,29 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - num_iterations=self.num_iterations, - min_documents=self.min_documents, - learning_rates=self.learning_rate, + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + 
all_args), + number_of_iterations=self.number_of_iterations, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, + learning_rate=self.learning_rate, normalize_features=self.normalize, caching=self.caching, unbalanced_sets=self.unbalanced_sets, entropy_coefficient=self.entropy_coefficient, gain_confidence_level=self.gain_conf_level, - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, disk_transpose=self.disk_transpose, - max_bins=self.num_bins, - max_output=self.max_output, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + maximum_tree_output=self.maximum_tree_output, get_derivatives_sample_rate=self.get_derivatives_sample_rate, - rng_seed=self.random_state, + seed=self.random_state, feature_flocks=self.feature_flocks, enable_pruning=self.enable_pruning) diff --git a/src/python/nimbusml/internal/core/ensemble/gamregressor.py b/src/python/nimbusml/internal/core/ensemble/gamregressor.py index 07a093c6..048bf874 100644 --- a/src/python/nimbusml/internal/core/ensemble/gamregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/gamregressor.py @@ -79,10 +79,13 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): `_ - :param num_iterations: Total number of iterations over all features. + :param number_of_iterations: Total number of iterations over all features. - :param min_documents: Minimum number of training instances required to form - a partition. + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param learning_rate: Determines the size of the step taken in the direction of the gradient in each step of the learning process. This @@ -113,7 +116,7 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param pruning_metrics: Metric for pruning. (For regression, 1: L1, 2:L2; default L2). @@ -124,15 +127,16 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): :param gain_conf_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). - :param train_threads: The number of threads to use. + :param number_of_threads: The number of threads to use. :param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose. - :param num_bins: Maximum number of distinct values (bins) per feature. + :param maximum_bin_count_per_feature: Maximum number of distinct values + (bins) per feature. - :param max_output: Upper bound on absolute value of single output. + :param maximum_tree_output: Upper bound on absolute value of single output. :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function. 
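The GAM hunks above (and the FastTreesTweedieRegressor hunk before them) apply one consistent renaming scheme: counts are spelled out (`number_of_*`, `minimum_*_count_*`), threading is `number_of_threads`, and the entry-point seed becomes `seed`/`random_state`. A minimal construction sketch follows, assuming the public wrapper in `nimbusml.ensemble` mirrors this internal-core signature; the values are illustrative, not recommended settings.

    from nimbusml.ensemble import GamBinaryClassifier

    # Construct the learner with the post-rename keyword arguments shown in
    # the hunks above (old names noted in comments).
    gam = GamBinaryClassifier(
        number_of_iterations=200,            # was num_iterations
        minimum_example_count_per_leaf=10,   # was min_documents
        maximum_bin_count_per_feature=255,   # was num_bins
        maximum_tree_output=float('inf'),    # was max_output
        number_of_threads=2,                 # was train_threads
        random_state=123)
    print(gam)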
@@ -164,18 +168,18 @@ class GamRegressor(BasePipelineItem, DefaultSignatureWithRoles): @trace def __init__( self, - num_iterations=9500, - min_documents=10, + number_of_iterations=9500, + minimum_example_count_per_leaf=10, learning_rate=0.002, normalize='Auto', caching='Auto', pruning_metrics=2, entropy_coefficient=0.0, gain_conf_level=0, - train_threads=None, + number_of_threads=None, disk_transpose=None, - num_bins=255, - max_output=float('inf'), + maximum_bin_count_per_feature=255, + maximum_tree_output=float('inf'), get_derivatives_sample_rate=1, random_state=123, feature_flocks=True, @@ -184,18 +188,18 @@ def __init__( BasePipelineItem.__init__( self, type='regressor', **params) - self.num_iterations = num_iterations - self.min_documents = min_documents + self.number_of_iterations = number_of_iterations + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.learning_rate = learning_rate self.normalize = normalize self.caching = caching self.pruning_metrics = pruning_metrics self.entropy_coefficient = entropy_coefficient self.gain_conf_level = gain_conf_level - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.disk_transpose = disk_transpose - self.num_bins = num_bins - self.max_output = max_output + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.maximum_tree_output = maximum_tree_output self.get_derivatives_sample_rate = get_derivatives_sample_rate self.random_state = random_state self.feature_flocks = feature_flocks @@ -208,23 +212,29 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - num_iterations=self.num_iterations, - min_documents=self.min_documents, - learning_rates=self.learning_rate, + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), + number_of_iterations=self.number_of_iterations, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, + learning_rate=self.learning_rate, normalize_features=self.normalize, caching=self.caching, pruning_metrics=self.pruning_metrics, entropy_coefficient=self.entropy_coefficient, gain_confidence_level=self.gain_conf_level, - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, disk_transpose=self.disk_transpose, - max_bins=self.num_bins, - max_output=self.max_output, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + maximum_tree_output=self.maximum_tree_output, get_derivatives_sample_rate=self.get_derivatives_sample_rate, - rng_seed=self.random_state, + seed=self.random_state, feature_flocks=self.feature_flocks, enable_pruning=self.enable_pruning) diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py index 03622654..2bf8468b 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py @@ -34,17 +34,25 @@ class LightGbmBinaryClassifier( `GitHub: LightGBM `_ - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. 
Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. Available options are: @@ -60,43 +68,50 @@ class LightGbmBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. + :param unbalanced_sets: Use for binary classification when training data is + not balanced. - :param verbose_eval: Verbose. + :param weight_of_positive_examples: Control the balance of positive and + negative weights, useful for unbalanced classes. A typical value to + consider: sum(negative cases) / sum(positive cases). - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. - :param use_missing: Enable missing value auto infer or not. + :param handle_missing_value: Enable special handling of missing value or + not. - :param min_data_per_group: Min number of instances per categorical group. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. 
- :param max_cat_threshold: Max number of categorical thresholds. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_l2: L2 Regularization for categorical split. + :param l2_categorical_regularization: L2 Regularization for categorical + split. + + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. @@ -120,57 +135,59 @@ class LightGbmBinaryClassifier( @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + unbalanced_sets=False, + weight_of_positive_examples=1.0, + sigmoid=0.5, + evaluation_metric='Logloss', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.num_boost_round = num_boost_round + self.number_of_iterations = number_of_iterations self.learning_rate = learning_rate - self.num_leaves = num_leaves - self.min_data_per_leaf = min_data_per_leaf + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.booster = booster self.normalize = normalize self.caching = caching - self.max_bin = max_bin - self.verbose_eval = verbose_eval + self.unbalanced_sets = unbalanced_sets + self.weight_of_positive_examples = weight_of_positive_examples + self.sigmoid = sigmoid + self.evaluation_metric = evaluation_metric + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.verbose = verbose self.silent = silent - self.n_thread = n_thread - self.eval_metric = eval_metric - self.use_softmax = use_softmax + self.number_of_threads = number_of_threads self.early_stopping_round = early_stopping_round - self.custom_gains = custom_gains - self.sigmoid = sigmoid self.batch_size = batch_size - self.use_cat = use_cat - self.use_missing = use_missing - self.min_data_per_group = min_data_per_group - self.max_cat_threshold = max_cat_threshold - self.cat_smooth = cat_smooth - self.cat_l2 = cat_l2 + self.use_categorical_split = use_categorical_split + self.handle_missing_value = handle_missing_value + self.minimum_example_count_per_group = minimum_example_count_per_group + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.categorical_smoothing = categorical_smoothing + self.l2_categorical_regularization = l2_categorical_regularization + self.random_state = random_state self.parallel_trainer = parallel_trainer @property @@ -180,33 +197,34 @@ def 
_entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_boost_round=self.num_boost_round, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_iterations=self.number_of_iterations, learning_rate=self.learning_rate, - num_leaves=self.num_leaves, - min_data_per_leaf=self.min_data_per_leaf, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, booster=self.booster, normalize_features=self.normalize, caching=self.caching, - max_bin=self.max_bin, - verbose_eval=self.verbose_eval, + unbalanced_sets=self.unbalanced_sets, + weight_of_positive_examples=self.weight_of_positive_examples, + sigmoid=self.sigmoid, + evaluation_metric=self.evaluation_metric, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + verbose=self.verbose, silent=self.silent, - n_thread=self.n_thread, - eval_metric=self.eval_metric, - use_softmax=self.use_softmax, + number_of_threads=self.number_of_threads, early_stopping_round=self.early_stopping_round, - custom_gains=self.custom_gains, - sigmoid=self.sigmoid, batch_size=self.batch_size, - use_cat=self.use_cat, - use_missing=self.use_missing, - min_data_per_group=self.min_data_per_group, - max_cat_threshold=self.max_cat_threshold, - cat_smooth=self.cat_smooth, - cat_l2=self.cat_l2, + use_categorical_split=self.use_categorical_split, + handle_missing_value=self.handle_missing_value, + minimum_example_count_per_group=self.minimum_example_count_per_group, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + categorical_smoothing=self.categorical_smoothing, + l2_categorical_regularization=self.l2_categorical_regularization, + seed=self.random_state, parallel_trainer=self.parallel_trainer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py index 690c30b4..ca87aa7b 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py @@ -34,17 +34,25 @@ class LightGbmClassifier( `GitHub: LightGBM `_ - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. 
If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. Available options are: @@ -60,43 +68,45 @@ class LightGbmClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. - - :param verbose_eval: Verbose. + :param use_softmax: Use softmax loss for the multi classification. - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. + + :param handle_missing_value: Enable special handling of missing value or + not. - :param use_missing: Enable missing value auto infer or not. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param min_data_per_group: Min number of instances per categorical group. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param max_cat_threshold: Max number of categorical thresholds. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param l2_categorical_regularization: L2 Regularization for categorical + split. - :param cat_l2: L2 Regularization for categorical split. + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. 
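The LightGbmBinaryClassifier and LightGbmClassifier hunks rename the LightGBM-specific knobs in the same way. A minimal sketch, assuming the public `nimbusml.ensemble` wrapper exposes the keyword names shown in this diff; the settings are illustrative only.

    from nimbusml.ensemble import LightGbmBinaryClassifier

    clf = LightGbmBinaryClassifier(
        number_of_iterations=100,            # was num_boost_round
        number_of_leaves=31,                 # was num_leaves
        minimum_example_count_per_leaf=20,   # was min_data_per_leaf
        maximum_bin_count_per_feature=255,   # was max_bin
        number_of_threads=4,                 # was n_thread
        evaluation_metric='Logloss',         # was eval_metric
        unbalanced_sets=False,               # new to the binary classifier surface
        random_state=42)                     # new: seeds LightGBM
    print(clf)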
@@ -120,57 +130,57 @@ class LightGbmClassifier( @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, - silent=True, - n_thread=None, - eval_metric='DefaultMetric', use_softmax=None, - early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', sigmoid=0.5, + evaluation_metric='Error', + maximum_bin_count_per_feature=255, + verbose=False, + silent=True, + number_of_threads=None, + early_stopping_round=0, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.num_boost_round = num_boost_round + self.number_of_iterations = number_of_iterations self.learning_rate = learning_rate - self.num_leaves = num_leaves - self.min_data_per_leaf = min_data_per_leaf + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.booster = booster self.normalize = normalize self.caching = caching - self.max_bin = max_bin - self.verbose_eval = verbose_eval - self.silent = silent - self.n_thread = n_thread - self.eval_metric = eval_metric self.use_softmax = use_softmax - self.early_stopping_round = early_stopping_round - self.custom_gains = custom_gains self.sigmoid = sigmoid + self.evaluation_metric = evaluation_metric + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.verbose = verbose + self.silent = silent + self.number_of_threads = number_of_threads + self.early_stopping_round = early_stopping_round self.batch_size = batch_size - self.use_cat = use_cat - self.use_missing = use_missing - self.min_data_per_group = min_data_per_group - self.max_cat_threshold = max_cat_threshold - self.cat_smooth = cat_smooth - self.cat_l2 = cat_l2 + self.use_categorical_split = use_categorical_split + self.handle_missing_value = handle_missing_value + self.minimum_example_count_per_group = minimum_example_count_per_group + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.categorical_smoothing = categorical_smoothing + self.l2_categorical_regularization = l2_categorical_regularization + self.random_state = random_state self.parallel_trainer = parallel_trainer @property @@ -180,33 +190,33 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_boost_round=self.num_boost_round, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_iterations=self.number_of_iterations, 
learning_rate=self.learning_rate, - num_leaves=self.num_leaves, - min_data_per_leaf=self.min_data_per_leaf, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, booster=self.booster, normalize_features=self.normalize, caching=self.caching, - max_bin=self.max_bin, - verbose_eval=self.verbose_eval, - silent=self.silent, - n_thread=self.n_thread, - eval_metric=self.eval_metric, use_softmax=self.use_softmax, - early_stopping_round=self.early_stopping_round, - custom_gains=self.custom_gains, sigmoid=self.sigmoid, + evaluation_metric=self.evaluation_metric, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + verbose=self.verbose, + silent=self.silent, + number_of_threads=self.number_of_threads, + early_stopping_round=self.early_stopping_round, batch_size=self.batch_size, - use_cat=self.use_cat, - use_missing=self.use_missing, - min_data_per_group=self.min_data_per_group, - max_cat_threshold=self.max_cat_threshold, - cat_smooth=self.cat_smooth, - cat_l2=self.cat_l2, + use_categorical_split=self.use_categorical_split, + handle_missing_value=self.handle_missing_value, + minimum_example_count_per_group=self.minimum_example_count_per_group, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + categorical_smoothing=self.categorical_smoothing, + l2_categorical_regularization=self.l2_categorical_regularization, + seed=self.random_state, parallel_trainer=self.parallel_trainer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py index dbbe8623..6c06148d 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py @@ -35,17 +35,25 @@ class LightGbmRanker(BasePipelineItem, DefaultSignatureWithRoles): `GitHub: LightGBM `_ - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. 
Available options are: @@ -61,43 +69,45 @@ class LightGbmRanker(BasePipelineItem, DefaultSignatureWithRoles): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. + :param custom_gains: An array of gains associated to each relevance label. - :param verbose_eval: Verbose. - - :param silent: Printing running messages. + :param sigmoid: Parameter for the sigmoid function. - :param n_thread: Number of parallel threads used to run LightGBM. + :param evaluation_metric: Evaluation metrics. - :param eval_metric: Evaluation metrics. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param use_softmax: Use softmax loss for the multi classification. + :param verbose: Verbose. - :param early_stopping_round: Rounds of early stopping, 0 will disable it. + :param silent: Printing running messages. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. + :param number_of_threads: Number of parallel threads used to run LightGBM. - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. + :param early_stopping_round: Rounds of early stopping, 0 will disable it. :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. - :param use_missing: Enable missing value auto infer or not. + :param handle_missing_value: Enable special handling of missing value or + not. - :param min_data_per_group: Min number of instances per categorical group. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param max_cat_threshold: Max number of categorical thresholds. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_l2: L2 Regularization for categorical split. + :param l2_categorical_regularization: L2 Regularization for categorical + split. + + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. 
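Two changes in the LightGbmRanker hunk are easy to miss: `custom_gains` is now a list of integers instead of a comma-separated string, and the group-id role is addressed as `row_group_column_name`. A construction sketch under the same assumptions as above (public wrapper mirrors the internal signature; values illustrative):

    from nimbusml.ensemble import LightGbmRanker

    ranker = LightGbmRanker(
        number_of_iterations=100,
        # custom_gains was previously the string '0,3,7,...,4095'; it is now a list of ints.
        custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095],
        evaluation_metric='NormalizedDiscountedCumulativeGain',
        sigmoid=0.5,
        random_state=0)
    print(ranker)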
@@ -121,56 +131,56 @@ class LightGbmRanker(BasePipelineItem, DefaultSignatureWithRoles): @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], + sigmoid=0.5, + evaluation_metric='NormalizedDiscountedCumulativeGain', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, **params): BasePipelineItem.__init__(self, type='ranker', **params) - self.num_boost_round = num_boost_round + self.number_of_iterations = number_of_iterations self.learning_rate = learning_rate - self.num_leaves = num_leaves - self.min_data_per_leaf = min_data_per_leaf + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.booster = booster self.normalize = normalize self.caching = caching - self.max_bin = max_bin - self.verbose_eval = verbose_eval - self.silent = silent - self.n_thread = n_thread - self.eval_metric = eval_metric - self.use_softmax = use_softmax - self.early_stopping_round = early_stopping_round self.custom_gains = custom_gains self.sigmoid = sigmoid + self.evaluation_metric = evaluation_metric + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.verbose = verbose + self.silent = silent + self.number_of_threads = number_of_threads + self.early_stopping_round = early_stopping_round self.batch_size = batch_size - self.use_cat = use_cat - self.use_missing = use_missing - self.min_data_per_group = min_data_per_group - self.max_cat_threshold = max_cat_threshold - self.cat_smooth = cat_smooth - self.cat_l2 = cat_l2 + self.use_categorical_split = use_categorical_split + self.handle_missing_value = handle_missing_value + self.minimum_example_count_per_group = minimum_example_count_per_group + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.categorical_smoothing = categorical_smoothing + self.l2_categorical_regularization = l2_categorical_regularization + self.random_state = random_state self.parallel_trainer = parallel_trainer @property @@ -180,33 +190,33 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_boost_round=self.num_boost_round, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + 
row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_iterations=self.number_of_iterations, learning_rate=self.learning_rate, - num_leaves=self.num_leaves, - min_data_per_leaf=self.min_data_per_leaf, + number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, booster=self.booster, normalize_features=self.normalize, caching=self.caching, - max_bin=self.max_bin, - verbose_eval=self.verbose_eval, - silent=self.silent, - n_thread=self.n_thread, - eval_metric=self.eval_metric, - use_softmax=self.use_softmax, - early_stopping_round=self.early_stopping_round, custom_gains=self.custom_gains, sigmoid=self.sigmoid, + evaluation_metric=self.evaluation_metric, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + verbose=self.verbose, + silent=self.silent, + number_of_threads=self.number_of_threads, + early_stopping_round=self.early_stopping_round, batch_size=self.batch_size, - use_cat=self.use_cat, - use_missing=self.use_missing, - min_data_per_group=self.min_data_per_group, - max_cat_threshold=self.max_cat_threshold, - cat_smooth=self.cat_smooth, - cat_l2=self.cat_l2, + use_categorical_split=self.use_categorical_split, + handle_missing_value=self.handle_missing_value, + minimum_example_count_per_group=self.minimum_example_count_per_group, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + categorical_smoothing=self.categorical_smoothing, + l2_categorical_regularization=self.l2_categorical_regularization, + seed=self.random_state, parallel_trainer=self.parallel_trainer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py index 36815a46..20fe5e57 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py @@ -34,17 +34,25 @@ class LightGbmRegressor( `GitHub: LightGBM `_ - :param num_boost_round: Number of iterations. - - :param learning_rate: Shrinkage rate for trees, used to prevent over- - fitting. Range: (0,1]. - - :param num_leaves: The maximum number of leaves (terminal nodes) that can - be created in any tree. Higher values potentially increase the size of - the tree and get better precision, but risk overfitting and requiring - longer training times. - - :param min_data_per_leaf: Minimum number of instances needed in a child. + :param number_of_iterations: Number of iterations. + + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. + + :param number_of_leaves: The maximum number of leaves (terminal nodes) that + can be created in any tree. Higher values potentially increase the size + of the tree and get better precision, but risk overfitting and + requiring longer training times. + + :param minimum_example_count_per_leaf: Minimum number of training instances + required to form a leaf. That is, the minimal number of documents + allowed in a leaf of regression tree, out of the sub-sampled data. A + 'split' means that features in each level of the tree (node) are + randomly divided. :param booster: Which booster to use. 
Available options are: @@ -60,43 +68,41 @@ class LightGbmRegressor( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param max_bin: Max number of bucket bin for features. + :param evaluation_metric: Evaluation metrics. - :param verbose_eval: Verbose. - - :param silent: Printing running messages. + :param maximum_bin_count_per_feature: Maximum number of bucket bin for + features. - :param n_thread: Number of parallel threads used to run LightGBM. + :param verbose: Verbose. - :param eval_metric: Evaluation metrics. + :param silent: Printing running messages. - :param use_softmax: Use softmax loss for the multi classification. + :param number_of_threads: Number of parallel threads used to run LightGBM. :param early_stopping_round: Rounds of early stopping, 0 will disable it. - :param custom_gains: Comma seperated list of gains associated to each - relevance label. - - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. - :param batch_size: Number of entries in a batch when loading data. - :param use_cat: Enable categorical split or not. + :param use_categorical_split: Enable categorical split or not. + + :param handle_missing_value: Enable special handling of missing value or + not. - :param use_missing: Enable missing value auto infer or not. + :param minimum_example_count_per_group: Minimum number of instances per + categorical group. - :param min_data_per_group: Min number of instances per categorical group. + :param maximum_categorical_split_point_count: Max number of categorical + thresholds. - :param max_cat_threshold: Max number of categorical thresholds. + :param categorical_smoothing: Lapalace smooth term in categorical feature + spilt. Avoid the bias of small categories. - :param cat_smooth: Lapalace smooth term in categorical feature spilt. Avoid - the bias of small categories. + :param l2_categorical_regularization: L2 Regularization for categorical + split. - :param cat_l2: L2 Regularization for categorical split. + :param random_state: Sets the random seed for LightGBM to use. :param parallel_trainer: Parallel LightGBM Learning Algorithm. 
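All four LightGBM learners share the same old-to-new keyword mapping. The sketch below collects the one-to-one renames visible in these hunks into a small, hypothetical migration helper; the helper itself is not part of this change set, and arguments that were removed or moved to a single learner (for example `custom_gains`, `sigmoid`, `use_softmax`) are deliberately left out.

    # Hypothetical helper for porting pre-1.0 LightGbm* keyword arguments.
    _LIGHTGBM_RENAMES = {
        'num_boost_round': 'number_of_iterations',
        'num_leaves': 'number_of_leaves',
        'min_data_per_leaf': 'minimum_example_count_per_leaf',
        'max_bin': 'maximum_bin_count_per_feature',
        'verbose_eval': 'verbose',
        'n_thread': 'number_of_threads',
        'eval_metric': 'evaluation_metric',
        'use_cat': 'use_categorical_split',
        'use_missing': 'handle_missing_value',
        'min_data_per_group': 'minimum_example_count_per_group',
        'max_cat_threshold': 'maximum_categorical_split_point_count',
        'cat_smooth': 'categorical_smoothing',
        'cat_l2': 'l2_categorical_regularization',
    }

    def migrate_lightgbm_kwargs(old_kwargs):
        """Return kwargs with pre-1.0 names replaced by their new equivalents."""
        return {_LIGHTGBM_RENAMES.get(k, k): v for k, v in old_kwargs.items()}

    print(migrate_lightgbm_kwargs({'num_leaves': 31, 'n_thread': 4}))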
@@ -120,57 +126,53 @@ class LightGbmRegressor( @trace def __init__( self, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, + number_of_leaves=None, + minimum_example_count_per_leaf=None, booster=None, normalize='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + evaluation_metric='RootMeanSquaredError', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + random_state=None, parallel_trainer=None, **params): BasePipelineItem.__init__( self, type='regressor', **params) - self.num_boost_round = num_boost_round + self.number_of_iterations = number_of_iterations self.learning_rate = learning_rate - self.num_leaves = num_leaves - self.min_data_per_leaf = min_data_per_leaf + self.number_of_leaves = number_of_leaves + self.minimum_example_count_per_leaf = minimum_example_count_per_leaf self.booster = booster self.normalize = normalize self.caching = caching - self.max_bin = max_bin - self.verbose_eval = verbose_eval + self.evaluation_metric = evaluation_metric + self.maximum_bin_count_per_feature = maximum_bin_count_per_feature + self.verbose = verbose self.silent = silent - self.n_thread = n_thread - self.eval_metric = eval_metric - self.use_softmax = use_softmax + self.number_of_threads = number_of_threads self.early_stopping_round = early_stopping_round - self.custom_gains = custom_gains - self.sigmoid = sigmoid self.batch_size = batch_size - self.use_cat = use_cat - self.use_missing = use_missing - self.min_data_per_group = min_data_per_group - self.max_cat_threshold = max_cat_threshold - self.cat_smooth = cat_smooth - self.cat_l2 = cat_l2 + self.use_categorical_split = use_categorical_split + self.handle_missing_value = handle_missing_value + self.minimum_example_count_per_group = minimum_example_count_per_group + self.maximum_categorical_split_point_count = maximum_categorical_split_point_count + self.categorical_smoothing = categorical_smoothing + self.l2_categorical_regularization = l2_categorical_regularization + self.random_state = random_state self.parallel_trainer = parallel_trainer @property @@ -180,33 +182,31 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), - group_id_column=self._getattr_role('group_id_column', all_args), - num_boost_round=self.num_boost_round, + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), + row_group_column_name=self._getattr_role('row_group_column_name', all_args), + number_of_iterations=self.number_of_iterations, learning_rate=self.learning_rate, - num_leaves=self.num_leaves, - min_data_per_leaf=self.min_data_per_leaf, + 
number_of_leaves=self.number_of_leaves, + minimum_example_count_per_leaf=self.minimum_example_count_per_leaf, booster=self.booster, normalize_features=self.normalize, caching=self.caching, - max_bin=self.max_bin, - verbose_eval=self.verbose_eval, + evaluation_metric=self.evaluation_metric, + maximum_bin_count_per_feature=self.maximum_bin_count_per_feature, + verbose=self.verbose, silent=self.silent, - n_thread=self.n_thread, - eval_metric=self.eval_metric, - use_softmax=self.use_softmax, + number_of_threads=self.number_of_threads, early_stopping_round=self.early_stopping_round, - custom_gains=self.custom_gains, - sigmoid=self.sigmoid, batch_size=self.batch_size, - use_cat=self.use_cat, - use_missing=self.use_missing, - min_data_per_group=self.min_data_per_group, - max_cat_threshold=self.max_cat_threshold, - cat_smooth=self.cat_smooth, - cat_l2=self.cat_l2, + use_categorical_split=self.use_categorical_split, + handle_missing_value=self.handle_missing_value, + minimum_example_count_per_group=self.minimum_example_count_per_group, + maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, + categorical_smoothing=self.categorical_smoothing, + l2_categorical_regularization=self.l2_categorical_regularization, + seed=self.random_state, parallel_trainer=self.parallel_trainer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/feature_extraction/categorical/onehothashvectorizer.py b/src/python/nimbusml/internal/core/feature_extraction/categorical/onehothashvectorizer.py index 6cfeb8c0..94de4a6b 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/categorical/onehothashvectorizer.py +++ b/src/python/nimbusml/internal/core/feature_extraction/categorical/onehothashvectorizer.py @@ -35,7 +35,7 @@ class OneHotHashVectorizer( ``OneHotHashVectorizer`` does not currently support handling factor data. - :param hash_bits: An integer specifying the number of bits to hash into. + :param number_of_bits: An integer specifying the number of bits to hash into. Must be between 1 and 30, inclusive. The default value is 16. :param output_kind: A character string that specifies the kind @@ -67,7 +67,7 @@ class OneHotHashVectorizer( :param ordered: ``True`` to include the position of each term in the hash. Otherwise, ``False``. The default value is ``True``. - :param invert_hash: An integer specifying the limit on the number of keys + :param maximum_number_of_inverts: An integer specifying the limit on the number of keys that can be used to generate the slot name. ``0`` means no invert hashing; ``-1`` means no limit. 
While a zero value gives better performance, a non-zero value is needed to get meaningful coefficent @@ -90,20 +90,20 @@ class OneHotHashVectorizer( @trace def __init__( self, - hash_bits=16, + number_of_bits=16, output_kind='Bag', random_state=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, **params): BasePipelineItem.__init__( self, type='transform', **params) - self.hash_bits = hash_bits + self.number_of_bits = number_of_bits self.output_kind = output_kind self.random_state = random_state self.ordered = ordered - self.invert_hash = invert_hash + self.maximum_number_of_inverts = maximum_number_of_inverts @property def _entrypoint(self): @@ -151,11 +151,11 @@ def _get_node(self, **all_args): o in zip( input_columns, output_columns)] if input_columns else None, - hash_bits=self.hash_bits, + number_of_bits=self.number_of_bits, output_kind=self.output_kind, seed=self.random_state, ordered=self.ordered, - invert_hash=self.invert_hash) + maximum_number_of_inverts=self.maximum_number_of_inverts) all_args.update(algo_args) return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/feature_extraction/categorical/onehotvectorizer.py b/src/python/nimbusml/internal/core/feature_extraction/categorical/onehotvectorizer.py index 3f813b07..22098e9f 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/categorical/onehotvectorizer.py +++ b/src/python/nimbusml/internal/core/feature_extraction/categorical/onehotvectorizer.py @@ -96,9 +96,9 @@ class OneHotVectorizer( def __init__( self, max_num_terms=1000000, - output_kind='Ind', + output_kind='Indicator', term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=True, **params): BasePipelineItem.__init__( diff --git a/src/python/nimbusml/internal/core/feature_extraction/image/pixelextractor.py b/src/python/nimbusml/internal/core/feature_extraction/image/pixelextractor.py index 4d8164d0..ce0ea420 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/image/pixelextractor.py +++ b/src/python/nimbusml/internal/core/feature_extraction/image/pixelextractor.py @@ -41,7 +41,9 @@ class PixelExtractor(BasePipelineItem, DefaultSignature): :param use_blue: Specifies whether to use blue channel. The default value is ``True``. - :param interleave_argb: Whether to separate each channel or + :param order: Order of colors. + + :param interleave: Whether to separate each channel or interleave in ARGB order. 
This might be important, for example, if you are training a convolutional neural network, since this would affect the shape of @@ -78,7 +80,8 @@ def __init__( use_red=True, use_green=True, use_blue=True, - interleave_argb=False, + order='ARGB', + interleave=False, convert=True, offset=None, scale=None, @@ -90,7 +93,8 @@ def __init__( self.use_red = use_red self.use_green = use_green self.use_blue = use_blue - self.interleave_argb = interleave_argb + self.order = order + self.interleave = interleave self.convert = convert self.offset = offset self.scale = scale @@ -145,7 +149,8 @@ def _get_node(self, **all_args): use_red=self.use_red, use_green=self.use_green, use_blue=self.use_blue, - interleave_argb=self.interleave_argb, + order=self.order, + interleave=self.interleave, convert=self.convert, offset=self.offset, scale=self.scale) diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py index a137b235..07fde941 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngram.py @@ -58,12 +58,12 @@ class Ngram(Component): :param ngram_length: Ngram length. :param skip_length: Maximum number of tokens to skip when constructing an - ngram. + n-gram. - :param all_lengths: Whether to include all ngram lengths up to NgramLength + :param all_lengths: Whether to include all n-gram lengths up to NgramLength or only NgramLength. - :param max_num_terms: Maximum number of ngrams to store in the dictionary. + :param max_num_terms: Maximum number of n-grams to store in the dictionary. :param weighting: The weighting criteria. diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py index ac342e2e..cd08b4be 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/extractor/ngramhash.py @@ -58,15 +58,15 @@ class NgramHash(Component): * *term frequency-inverse document frequency* - the product term frequency and the inverse document frequency. - :param hash_bits: Number of bits to hash into. Must be between 1 and 30, - inclusive. + :param number_of_bits: Number of bits to hash into. Must be between 1 and + 30, inclusive. :param ngram_length: Ngram length. :param skip_length: Maximum number of tokens to skip when constructing an - ngram. + n-gram. - :param all_lengths: Whether to include all ngram lengths up to ngramLength + :param all_lengths: Whether to include all n-gram lengths up to ngramLength or only ngramLength. :param seed: Hashing seed. @@ -74,8 +74,9 @@ class NgramHash(Component): :param ordered: Whether the position of each source column should be included in the hash (when there are multiple source columns). - :param invert_hash: Limit the number of keys used to generate the slot name - to this many. 0 means no invert hashing, -1 means no limit. + :param maximum_number_of_inverts: Limit the number of keys used to generate + the slot name to this many. 0 means no invert hashing, -1 means no + limit. :param params: Additional arguments sent to compute engine. 
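The hashing transforms follow the same convention as the learners: `hash_bits` becomes `number_of_bits` and `invert_hash` becomes `maximum_number_of_inverts`, both for OneHotHashVectorizer and for the NgramHash extractor. A minimal sketch, assuming the public `nimbusml.feature_extraction.categorical` path and standard `<<` column selection; the data frame and column name are hypothetical.

    import pandas as pd
    from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

    df = pd.DataFrame({'color': ['red', 'green', 'blue', 'red']})

    # Hash into 2**8 slots and keep up to 10 keys per slot for readable slot names.
    ohv = OneHotHashVectorizer(
        number_of_bits=8,              # was hash_bits
        maximum_number_of_inverts=10   # was invert_hash
        ) << 'color'
    features = ohv.fit_transform(df)
    print(features.shape)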
@@ -94,29 +95,29 @@ class NgramHash(Component): @trace def __init__( self, - hash_bits=16, + number_of_bits=16, ngram_length=1, skip_length=0, all_lengths=True, seed=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, **params): - self.hash_bits = hash_bits + self.number_of_bits = number_of_bits self.ngram_length = ngram_length self.skip_length = skip_length self.all_lengths = all_lengths self.seed = seed self.ordered = ordered - self.invert_hash = invert_hash + self.maximum_number_of_inverts = maximum_number_of_inverts self.kind = 'NgramExtractor' self.name = 'NGramHash' self.settings = {} - if hash_bits is not None: - self.settings['HashBits'] = try_set( - obj=hash_bits, + if number_of_bits is not None: + self.settings['NumberOfBits'] = try_set( + obj=number_of_bits, none_acceptable=True, is_of_type=numbers.Real) if ngram_length is not None: @@ -140,9 +141,9 @@ def __init__( if ordered is not None: self.settings['Ordered'] = try_set( obj=ordered, none_acceptable=True, is_of_type=bool) - if invert_hash is not None: - self.settings['InvertHash'] = try_set( - obj=invert_hash, + if maximum_number_of_inverts is not None: + self.settings['MaximumNumberOfInverts'] = try_set( + obj=maximum_number_of_inverts, none_acceptable=True, is_of_type=numbers.Real) diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py index 98ba5dd3..45743c1b 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/lightlda.py @@ -43,8 +43,8 @@ class LightLda(BasePipelineItem, DefaultSignature): :param num_topic: The number of topics. - :param train_threads: The number of training threads. Default value depends - on number of logical processors. + :param number_of_threads: The number of training threads. Default value + depends on number of logical processors. :param num_max_doc_token: The threshold of maximum count of tokens per doc. @@ -91,7 +91,7 @@ class LightLda(BasePipelineItem, DefaultSignature): def __init__( self, num_topic=100, - train_threads=0, + number_of_threads=0, num_max_doc_token=512, alpha_sum=100.0, beta=0.01, @@ -107,7 +107,7 @@ def __init__( self, type='transform', **params) self.num_topic = num_topic - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.num_max_doc_token = num_max_doc_token self.alpha_sum = alpha_sum self.beta = beta @@ -166,7 +166,7 @@ def _get_node(self, **all_args): input_columns, output_columns)] if input_columns else None, num_topic=self.num_topic, - num_threads=self.train_threads, + num_threads=self.number_of_threads, num_max_doc_token=self.num_max_doc_token, alpha_sum=self.alpha_sum, beta=self.beta, diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py b/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py index 2c98b362..a7599aaa 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/ngramfeaturizer.py @@ -79,7 +79,22 @@ class NGramFeaturizer(BasePipelineItem, SingleOutputSignature): * ``"Spanish"`` * ``"Japanese"``. - :param use_predefined_stop_word_remover: Use stop remover or not. + :param stop_words_remover: Specifies the stopwords remover to use. There + are + three options supported: + + * `None`: No stopwords remover is used. 
+ * :py:class:`PredefinedStopWordsRemover + ` : + A precompiled language-specific list + of stop words is used that includes the most common words from + Microsoft Office. + * :py:class:`CustomStopWordsRemover + ` : A + user-defined list of stopwords. It accepts + the following option: ``stopword``. + + The default value is `None`. :param text_case: Text casing using the rules of the invariant culture. Takes the @@ -101,8 +116,8 @@ class NGramFeaturizer(BasePipelineItem, SingleOutputSignature): :param keep_numbers: ``False`` to remove numbers; ``True`` to retain numbers. The default value is ``True``. - :param output_tokens: Whether to output the transformed text tokens as an - additional column. + :param output_tokens_column_name: Column containing the transformed text + tokens. :param dictionary: A dictionary of whitelisted terms which accepts the following options: @@ -182,12 +197,12 @@ class NGramFeaturizer(BasePipelineItem, SingleOutputSignature): @trace def __init__( self, language='English', - use_predefined_stop_word_remover=False, + stop_words_remover=None, text_case='Lower', keep_diacritics=False, keep_punctuations=True, keep_numbers=True, - output_tokens=False, + output_tokens_column_name=None, dictionary=None, word_feature_extractor=n_gram( max_num_terms=[10000000]), @@ -201,12 +216,12 @@ def __init__( self, type='transform', **params) self.language = language - self.use_predefined_stop_word_remover = use_predefined_stop_word_remover + self.stop_words_remover = stop_words_remover self.text_case = text_case self.keep_diacritics = keep_diacritics self.keep_punctuations = keep_punctuations self.keep_numbers = keep_numbers - self.output_tokens = output_tokens + self.output_tokens_column_name = output_tokens_column_name self.dictionary = dictionary self.word_feature_extractor = word_feature_extractor self.char_feature_extractor = char_feature_extractor @@ -263,12 +278,12 @@ def _get_node(self, **all_args): algo_args = dict( column=column, language=self.language, - use_predefined_stop_word_remover=self.use_predefined_stop_word_remover, + stop_words_remover=self.stop_words_remover, text_case=self.text_case, keep_diacritics=self.keep_diacritics, keep_punctuations=self.keep_punctuations, keep_numbers=self.keep_numbers, - output_tokens=self.output_tokens, + output_tokens_column_name=self.output_tokens_column_name, dictionary=self.dictionary, word_feature_extractor=self.word_feature_extractor, char_feature_extractor=self.char_feature_extractor, diff --git a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py index 691a79d3..d67df9db 100644 --- a/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py +++ b/src/python/nimbusml/internal/core/feature_extraction/text/wordembedding.py @@ -35,7 +35,7 @@ class WordEmbedding(BasePipelineItem, DefaultSignature): Available options are: 'GloVe50D', 'GloVe100D', 'GloVe200D', 'GloVe300D', 'GloVeTwitter25D', 'GloVeTwitter50D', 'GloVeTwitter100D', 'GloVeTwitter200D', 'FastTextWikipedia300D', - 'Sswe'. + 'SentimentSpecificWordEmbedding'. :param custom_lookup_table: Filename for custom word embedding model.
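A minimal sketch of the reworked text-featurization arguments above: stop_words_remover replaces use_predefined_stop_word_remover, output_tokens_column_name replaces output_tokens, and 'SentimentSpecificWordEmbedding' replaces the 'Sswe' model kind. The public import paths, the column mapping, and the token-column name below are assumptions, not taken from this patch:

    from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
    from nimbusml.feature_extraction.text.stopwords import PredefinedStopWordsRemover

    # was: NGramFeaturizer(use_predefined_stop_word_remover=True, output_tokens=True)
    featurizer = NGramFeaturizer(
        stop_words_remover=PredefinedStopWordsRemover(),
        output_tokens_column_name='review_TransformedText',
        columns={'features': ['review']})

    # was: WordEmbedding(model_kind='Sswe')
    embedding = WordEmbedding(
        model_kind='SentimentSpecificWordEmbedding',
        columns='review_TransformedText')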
@@ -47,10 +47,9 @@ class WordEmbedding(BasePipelineItem, DefaultSignature): <'This', 'is', 'good'>, users need to create an input column by: * concatenating columns with TX type, - * or using the ``output_tokens=True`` for ``NGramFeaturizer()`` to + * or using the ``output_tokens_column_name`` for ``NGramFeaturizer()`` to convert a column with sentences like "This is good" into <'This', - 'is', 'good'>. The column for the output token column is renamed with - a prefix of '_TranformedText'. + 'is', 'good'>. In the following example, after the ``NGramFeaturizer``, features @@ -82,7 +81,7 @@ class WordEmbedding(BasePipelineItem, DefaultSignature): @trace def __init__( self, - model_kind='Sswe', + model_kind='SentimentSpecificWordEmbedding', custom_lookup_table=None, **params): BasePipelineItem.__init__( diff --git a/src/python/nimbusml/internal/core/feature_selection/mutualinformationselector.py b/src/python/nimbusml/internal/core/feature_selection/mutualinformationselector.py index f99f23e2..a4dea0a0 100644 --- a/src/python/nimbusml/internal/core/feature_selection/mutualinformationselector.py +++ b/src/python/nimbusml/internal/core/feature_selection/mutualinformationselector.py @@ -112,8 +112,8 @@ def _get_node(self, **all_args): algo_args = dict( column=input_columns, - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), slots_in_output=self.slots_in_output, num_bins=self.num_bins) diff --git a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py index 0492a3c9..26471467 100644 --- a/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/averagedperceptronbinaryclassifier.py @@ -95,7 +95,7 @@ class AveragedPerceptronBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'hinge' `. Other choices are :py:class:`'exp' `, :py:class:`'log' @@ -103,31 +103,36 @@ class AveragedPerceptronBinaryClassifier( `. For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param decrease_learning_rate: Decrease learning rate. - :param l2_regularizer_weight: L2 Regularization Weight. + :param l2_regularization: L2 Regularization Weight. - :param num_iterations: Number of iterations. + :param number_of_iterations: Number of iterations. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. 
+ :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average. - :param do_lazy_updates: Instead of updating averaged weights on every - example, only update when loss is nonzero. + :param lazy_update: Instead of updating averaged weights on every example, + only update when loss is nonzero. :param recency_gain: Extra weight given to more recent updates. - :param recency_gain_multi: Whether Recency Gain is multiplicative (vs. - additive). + :param recency_gain_multiplicative: Whether Recency Gain is multiplicative + (vs. additive). :param averaged: Do averaging?. @@ -137,8 +142,6 @@ class AveragedPerceptronBinaryClassifier( :param shuffle: Whether to shuffle for each training iteration. - :param streaming_cache_size: Size of cache when trained in Scope. - :param params: Additional arguments sent to compute engine. .. seealso:: @@ -161,18 +164,17 @@ def __init__( loss='hinge', learning_rate=1.0, decrease_learning_rate=False, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, **params): BasePipelineItem.__init__( self, type='classifier', **params) @@ -186,18 +188,17 @@ def __init__( self.loss) self.learning_rate = learning_rate self.decrease_learning_rate = decrease_learning_rate - self.l2_regularizer_weight = l2_regularizer_weight - self.num_iterations = num_iterations - self.init_wts_diameter = init_wts_diameter + self.l2_regularization = l2_regularization + self.number_of_iterations = number_of_iterations + self.initial_weights_diameter = initial_weights_diameter self.reset_weights_after_x_examples = reset_weights_after_x_examples - self.do_lazy_updates = do_lazy_updates + self.lazy_update = lazy_update self.recency_gain = recency_gain - self.recency_gain_multi = recency_gain_multi + self.recency_gain_multiplicative = recency_gain_multiplicative self.averaged = averaged self.averaged_tolerance = averaged_tolerance self.initial_weights = initial_weights self.shuffle = shuffle - self.streaming_cache_size = streaming_cache_size @property def _entrypoint(self): @@ -206,11 +207,11 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), normalize_features=self.normalize, caching=self.caching, @@ -220,18 +221,17 @@ def _get_node(self, **all_args): self.loss), learning_rate=self.learning_rate, decrease_learning_rate=self.decrease_learning_rate, - l2_regularizer_weight=self.l2_regularizer_weight, - num_iterations=self.num_iterations, - 
init_wts_diameter=self.init_wts_diameter, + l2_regularization=self.l2_regularization, + number_of_iterations=self.number_of_iterations, + initial_weights_diameter=self.initial_weights_diameter, reset_weights_after_x_examples=self.reset_weights_after_x_examples, - do_lazy_updates=self.do_lazy_updates, + lazy_update=self.lazy_update, recency_gain=self.recency_gain, - recency_gain_multi=self.recency_gain_multi, + recency_gain_multiplicative=self.recency_gain_multiplicative, averaged=self.averaged, averaged_tolerance=self.averaged_tolerance, initial_weights=self.initial_weights, - shuffle=self.shuffle, - streaming_cache_size=self.streaming_cache_size) + shuffle=self.shuffle) all_args.update(algo_args) return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py index 8bf9c66d..10c5c2a5 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearbinaryclassifier.py @@ -70,7 +70,7 @@ class FastLinearBinaryClassifier( optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets - ``shuffle`` to ``False`` and ``train_threads`` to ``1``. + ``shuffle`` to ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -84,8 +84,8 @@ class FastLinearBinaryClassifier( shwartz13a/shalev-shwartz13a.pdf>`_ - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. + :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -114,7 +114,7 @@ class FastLinearBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. Other choices are :py:class:`'hinge' `, and @@ -122,7 +122,7 @@ class FastLinearBinaryClassifier( information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param positive_instance_weight: Apply weight to the positive class, for @@ -131,14 +131,15 @@ class FastLinearBinaryClassifier( :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. 
If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. @@ -162,23 +163,23 @@ class FastLinearBinaryClassifier( @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='log', - train_threads=None, + number_of_threads=None, positive_instance_weight=1.0, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.l2_weight = l2_weight + self.l2_regularization = l2_regularization self.l1_threshold = l1_threshold self.normalize = normalize self.caching = caching @@ -187,12 +188,12 @@ def __init__( 'SDCAClassificationLossFunction', self.__class__.__name__, self.loss) - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.positive_instance_weight = positive_instance_weight self.convergence_tolerance = convergence_tolerance - self.max_iterations = max_iterations + self.maximum_number_of_iterations = maximum_number_of_iterations self.shuffle = shuffle - self.check_frequency = check_frequency + self.convergence_check_frequency = convergence_check_frequency self.bias_learning_rate = bias_learning_rate @property @@ -202,13 +203,16 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), - l2_const=self.l2_weight, + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), + l2_regularization=self.l2_regularization, l1_threshold=self.l1_threshold, normalize_features=self.normalize, caching=self.caching, @@ -216,12 +220,12 @@ def _get_node(self, **all_args): 'SDCAClassificationLossFunction', self.__class__.__name__, self.loss), - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, positive_instance_weight=self.positive_instance_weight, convergence_tolerance=self.convergence_tolerance, - max_iterations=self.max_iterations, + maximum_number_of_iterations=self.maximum_number_of_iterations, shuffle=self.shuffle, - check_frequency=self.check_frequency, + convergence_check_frequency=self.convergence_check_frequency, bias_learning_rate=self.bias_learning_rate) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py index 7e5066ed..a2880b79 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearclassifier.py @@ -68,7 +68,7 @@ class FastLinearClassifier( optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -82,8 +82,8 @@ class FastLinearClassifier( shwartz13a/shalev-shwartz13a.pdf>`_ - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. 
+ :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -112,7 +112,7 @@ class FastLinearClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. Other choices are @@ -122,20 +122,21 @@ class FastLinearClassifier( documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. 
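A minimal sketch of the renamed SDCA trainer arguments above (l2_regularization, number_of_threads, maximum_number_of_iterations, convergence_check_frequency). Per the docstring note, shuffle=False together with number_of_threads=1 gives reproducible results; the public import path and the values below are assumptions:

    from nimbusml.linear_model import FastLinearBinaryClassifier

    # was: FastLinearBinaryClassifier(l2_weight=..., train_threads=...,
    #                                 max_iterations=..., check_frequency=...)
    trainer = FastLinearBinaryClassifier(
        l2_regularization=0.01,
        number_of_threads=1,              # renamed from train_threads
        maximum_number_of_iterations=20,  # renamed from max_iterations
        convergence_check_frequency=5,    # renamed from check_frequency
        shuffle=False)                    # shuffle=False + 1 thread => reproducible runs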
@@ -159,22 +160,22 @@ class FastLinearClassifier( @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='log', - train_threads=None, + number_of_threads=None, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, **params): BasePipelineItem.__init__( self, type='classifier', **params) - self.l2_weight = l2_weight + self.l2_regularization = l2_regularization self.l1_threshold = l1_threshold self.normalize = normalize self.caching = caching @@ -183,11 +184,11 @@ def __init__( 'SDCAClassificationLossFunction', self.__class__.__name__, self.loss) - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.convergence_tolerance = convergence_tolerance - self.max_iterations = max_iterations + self.maximum_number_of_iterations = maximum_number_of_iterations self.shuffle = shuffle - self.check_frequency = check_frequency + self.convergence_check_frequency = convergence_check_frequency self.bias_learning_rate = bias_learning_rate @property @@ -197,13 +198,16 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), - l2_const=self.l2_weight, + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), + l2_regularization=self.l2_regularization, l1_threshold=self.l1_threshold, normalize_features=self.normalize, caching=self.caching, @@ -211,11 +215,11 @@ def _get_node(self, **all_args): 'SDCAClassificationLossFunction', self.__class__.__name__, self.loss), - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, convergence_tolerance=self.convergence_tolerance, - max_iterations=self.max_iterations, + maximum_number_of_iterations=self.maximum_number_of_iterations, shuffle=self.shuffle, - check_frequency=self.check_frequency, + convergence_check_frequency=self.convergence_check_frequency, bias_learning_rate=self.bias_learning_rate) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py index baa67ddb..cf9073e5 100644 --- a/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py +++ b/src/python/nimbusml/internal/core/linear_model/fastlinearregressor.py @@ -68,7 +68,7 @@ class FastLinearRegressor( optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -82,8 +82,8 @@ class FastLinearRegressor( shwartz13a/shalev-shwartz13a.pdf>`_ - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. + :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -112,26 +112,27 @@ class FastLinearRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. 
This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The only supported loss is :py:class:`'squared' `. For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. @@ -155,22 +156,22 @@ class FastLinearRegressor( @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='squared', - train_threads=None, + number_of_threads=None, convergence_tolerance=0.01, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=1.0, **params): BasePipelineItem.__init__( self, type='regressor', **params) - self.l2_weight = l2_weight + self.l2_regularization = l2_regularization self.l1_threshold = l1_threshold self.normalize = normalize self.caching = caching @@ -179,11 +180,11 @@ def __init__( 'SDCARegressionLossFunction', self.__class__.__name__, self.loss) - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.convergence_tolerance = convergence_tolerance - self.max_iterations = max_iterations + self.maximum_number_of_iterations = maximum_number_of_iterations self.shuffle = shuffle - self.check_frequency = check_frequency + self.convergence_check_frequency = convergence_check_frequency self.bias_learning_rate = bias_learning_rate @property @@ -193,13 +194,16 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), - l2_const=self.l2_weight, + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), + l2_regularization=self.l2_regularization, l1_threshold=self.l1_threshold, normalize_features=self.normalize, caching=self.caching, @@ -207,11 +211,11 @@ def _get_node(self, **all_args): 'SDCARegressionLossFunction', self.__class__.__name__, self.loss), - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, convergence_tolerance=self.convergence_tolerance, - 
max_iterations=self.max_iterations, + maximum_number_of_iterations=self.maximum_number_of_iterations, shuffle=self.shuffle, - check_frequency=self.check_frequency, + convergence_check_frequency=self.convergence_check_frequency, bias_learning_rate=self.bias_learning_rate) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py index f410b3cc..098c92e9 100644 --- a/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionbinaryclassifier.py @@ -112,16 +112,18 @@ class LogisticRegressionBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param show_training_statistics: Show statistics of training examples. - :param l1_weight: L1 regularization weight. + :param l2_regularization: L2 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param l1_regularization: L1 regularization weight. - :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. + + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -132,23 +134,23 @@ class LogisticRegressionBinaryClassifier( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. 
If ``False``, enables the logistic regression @@ -176,17 +178,18 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, **params): BasePipelineItem.__init__( @@ -194,17 +197,18 @@ def __init__( self.normalize = normalize self.caching = caching - self.l2_weight = l2_weight - self.l1_weight = l1_weight - self.opt_tol = opt_tol - self.memory_size = memory_size + self.show_training_statistics = show_training_statistics + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization + self.optimization_tolerance = optimization_tolerance + self.history_size = history_size self.enforce_non_negativity = enforce_non_negativity - self.init_wts_diameter = init_wts_diameter - self.max_iterations = max_iterations - self.sgd_init_tol = sgd_init_tol + self.initial_weights_diameter = initial_weights_diameter + self.maximum_number_of_iterations = maximum_number_of_iterations + self.stochastic_gradient_descent_initilaization_tolerance = stochastic_gradient_descent_initilaization_tolerance self.quiet = quiet self.use_threads = use_threads - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.dense_optimizer = dense_optimizer @property @@ -214,22 +218,23 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), normalize_features=self.normalize, caching=self.caching, - l2_weight=self.l2_weight, - l1_weight=self.l1_weight, - opt_tol=self.opt_tol, - memory_size=self.memory_size, + show_training_statistics=self.show_training_statistics, + l2_regularization=self.l2_regularization, + l1_regularization=self.l1_regularization, + optimization_tolerance=self.optimization_tolerance, + history_size=self.history_size, enforce_non_negativity=self.enforce_non_negativity, - init_wts_diameter=self.init_wts_diameter, - max_iterations=self.max_iterations, - sgd_initialization_tolerance=self.sgd_init_tol, + initial_weights_diameter=self.initial_weights_diameter, + maximum_number_of_iterations=self.maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=self.stochastic_gradient_descent_initilaization_tolerance, quiet=self.quiet, use_threads=self.use_threads, - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, dense_optimizer=self.dense_optimizer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py index eb58c4c2..90af2ffb 100644 --- 
a/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/logisticregressionclassifier.py @@ -113,16 +113,18 @@ class LogisticRegressionClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param show_training_statistics: Show statistics of training examples. - :param l1_weight: L1 regularization weight. + :param l2_regularization: L2 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param l1_regularization: L1 regularization weight. - :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. + + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -133,23 +135,23 @@ class LogisticRegressionClassifier( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. 
If ``False``, enables the logistic regression @@ -177,17 +179,18 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, **params): BasePipelineItem.__init__( @@ -195,17 +198,18 @@ def __init__( self.normalize = normalize self.caching = caching - self.l2_weight = l2_weight - self.l1_weight = l1_weight - self.opt_tol = opt_tol - self.memory_size = memory_size + self.show_training_statistics = show_training_statistics + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization + self.optimization_tolerance = optimization_tolerance + self.history_size = history_size self.enforce_non_negativity = enforce_non_negativity - self.init_wts_diameter = init_wts_diameter - self.max_iterations = max_iterations - self.sgd_init_tol = sgd_init_tol + self.initial_weights_diameter = initial_weights_diameter + self.maximum_number_of_iterations = maximum_number_of_iterations + self.stochastic_gradient_descent_initilaization_tolerance = stochastic_gradient_descent_initilaization_tolerance self.quiet = quiet self.use_threads = use_threads - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.dense_optimizer = dense_optimizer @property @@ -215,22 +219,23 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), normalize_features=self.normalize, caching=self.caching, - l2_weight=self.l2_weight, - l1_weight=self.l1_weight, - opt_tol=self.opt_tol, - memory_size=self.memory_size, + show_training_statistics=self.show_training_statistics, + l2_regularization=self.l2_regularization, + l1_regularization=self.l1_regularization, + optimization_tolerance=self.optimization_tolerance, + history_size=self.history_size, enforce_non_negativity=self.enforce_non_negativity, - init_wts_diameter=self.init_wts_diameter, - max_iterations=self.max_iterations, - sgd_initialization_tolerance=self.sgd_init_tol, + initial_weights_diameter=self.initial_weights_diameter, + maximum_number_of_iterations=self.maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=self.stochastic_gradient_descent_initilaization_tolerance, quiet=self.quiet, use_threads=self.use_threads, - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, dense_optimizer=self.dense_optimizer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/onlinegradientdescentregressor.py b/src/python/nimbusml/internal/core/linear_model/onlinegradientdescentregressor.py index 6956fb5f..4045c4d1 100644 --- 
a/src/python/nimbusml/internal/core/linear_model/onlinegradientdescentregressor.py +++ b/src/python/nimbusml/internal/core/linear_model/onlinegradientdescentregressor.py @@ -67,7 +67,7 @@ class OnlineGradientDescentRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'hinge' `. Other choices are :py:class:`'exp' `, @@ -75,32 +75,37 @@ class OnlineGradientDescentRegressor( `. For more information, please see :py:class:`'loss' `. - :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param decrease_learning_rate: Decrease learning rate. - :param l2_regularizer_weight: L2 Regularization Weight. + :param l2_regularization: L2 Regularization Weight. - :param num_iterations: Number of iterations. + :param number_of_iterations: Number of iterations. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average. - :param do_lazy_updates: Instead of updating averaged weights on every - example, only update when loss is nonzero. + :param lazy_update: Instead of updating averaged weights on every example, + only update when loss is nonzero. :param recency_gain: Extra weight given to more recent updates (`do_lazy_updates`` must be **False**). - :param recency_gain_multi: Whether Recency Gain is multiplicative vs. - additive (`do_lazy_updates`` must be **False**). + :param recency_gain_multiplicative: Whether Recency Gain is multiplicative + (vs. additive). :param averaged: Do averaging?. @@ -110,8 +115,6 @@ class OnlineGradientDescentRegressor( :param shuffle: Whether to shuffle for each training iteration. - :param streaming_cache_size: Size of cache when trained in Scope. - :param params: Additional arguments sent to compute engine. .. 
seealso:: @@ -137,18 +140,17 @@ def __init__( loss='squared', learning_rate=0.1, decrease_learning_rate=True, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, **params): BasePipelineItem.__init__( self, type='regressor', **params) @@ -162,18 +164,17 @@ def __init__( self.loss) self.learning_rate = learning_rate self.decrease_learning_rate = decrease_learning_rate - self.l2_regularizer_weight = l2_regularizer_weight - self.num_iterations = num_iterations - self.init_wts_diameter = init_wts_diameter + self.l2_regularization = l2_regularization + self.number_of_iterations = number_of_iterations + self.initial_weights_diameter = initial_weights_diameter self.reset_weights_after_x_examples = reset_weights_after_x_examples - self.do_lazy_updates = do_lazy_updates + self.lazy_update = lazy_update self.recency_gain = recency_gain - self.recency_gain_multi = recency_gain_multi + self.recency_gain_multiplicative = recency_gain_multiplicative self.averaged = averaged self.averaged_tolerance = averaged_tolerance self.initial_weights = initial_weights self.shuffle = shuffle - self.streaming_cache_size = streaming_cache_size @property def _entrypoint(self): @@ -182,11 +183,11 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), normalize_features=self.normalize, caching=self.caching, @@ -196,18 +197,17 @@ def _get_node(self, **all_args): self.loss), learning_rate=self.learning_rate, decrease_learning_rate=self.decrease_learning_rate, - l2_regularizer_weight=self.l2_regularizer_weight, - num_iterations=self.num_iterations, - init_wts_diameter=self.init_wts_diameter, + l2_regularization=self.l2_regularization, + number_of_iterations=self.number_of_iterations, + initial_weights_diameter=self.initial_weights_diameter, reset_weights_after_x_examples=self.reset_weights_after_x_examples, - do_lazy_updates=self.do_lazy_updates, + lazy_update=self.lazy_update, recency_gain=self.recency_gain, - recency_gain_multi=self.recency_gain_multi, + recency_gain_multiplicative=self.recency_gain_multiplicative, averaged=self.averaged, averaged_tolerance=self.averaged_tolerance, initial_weights=self.initial_weights, - shuffle=self.shuffle, - streaming_cache_size=self.streaming_cache_size) + shuffle=self.shuffle) all_args.update(algo_args) return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/linear_model/ordinaryleastsquaresregressor.py b/src/python/nimbusml/internal/core/linear_model/ordinaryleastsquaresregressor.py index 0d73488f..39e59f43 100644 --- a/src/python/nimbusml/internal/core/linear_model/ordinaryleastsquaresregressor.py +++ b/src/python/nimbusml/internal/core/linear_model/ordinaryleastsquaresregressor.py @@ -62,11 +62,11 @@ class OrdinaryLeastSquaresRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. 
+ :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param l2_regularization: L2 regularization weight. - :param per_parameter_significance: Whether to calculate per parameter + :param calculate_statistics: Whether to calculate per parameter significance statistics. :param params: Additional arguments sent to compute engine. @@ -91,16 +91,16 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1e-06, - per_parameter_significance=True, + l2_regularization=1e-06, + calculate_statistics=True, **params): BasePipelineItem.__init__( self, type='regressor', **params) self.normalize = normalize self.caching = caching - self.l2_weight = l2_weight - self.per_parameter_significance = per_parameter_significance + self.l2_regularization = l2_regularization + self.calculate_statistics = calculate_statistics @property def _entrypoint(self): @@ -109,13 +109,19 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), normalize_features=self.normalize, caching=self.caching, - l2_weight=self.l2_weight, - per_parameter_significance=self.per_parameter_significance) + l2_regularization=self.l2_regularization, + calculate_statistics=self.calculate_statistics) all_args.update(algo_args) return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/linear_model/poissonregressionregressor.py b/src/python/nimbusml/internal/core/linear_model/poissonregressionregressor.py index fee9a526..a313f2b4 100644 --- a/src/python/nimbusml/internal/core/linear_model/poissonregressionregressor.py +++ b/src/python/nimbusml/internal/core/linear_model/poissonregressionregressor.py @@ -62,16 +62,16 @@ class PoissonRegressionRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param l2_regularization: L2 regularization weight. - :param l1_weight: L1 regularization weight. + :param l1_regularization: L1 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. - :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -82,23 +82,23 @@ class PoissonRegressionRegressor( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. 
These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. If ``False``, enables the logistic regression @@ -131,17 +131,17 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, **params): BasePipelineItem.__init__( @@ -149,17 +149,17 @@ def __init__( self.normalize = normalize self.caching = caching - self.l2_weight = l2_weight - self.l1_weight = l1_weight - self.opt_tol = opt_tol - self.memory_size = memory_size + self.l2_regularization = l2_regularization + self.l1_regularization = l1_regularization + self.optimization_tolerance = optimization_tolerance + self.history_size = history_size self.enforce_non_negativity = enforce_non_negativity - self.init_wts_diameter = init_wts_diameter - self.max_iterations = max_iterations - self.sgd_init_tol = sgd_init_tol + self.initial_weights_diameter = initial_weights_diameter + self.maximum_number_of_iterations = maximum_number_of_iterations + self.stochastic_gradient_descent_initilaization_tolerance = stochastic_gradient_descent_initilaization_tolerance self.quiet = quiet self.use_threads = use_threads - self.train_threads = train_threads + self.number_of_threads = number_of_threads self.dense_optimizer = dense_optimizer @property @@ -169,22 +169,22 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), - weight_column=self._getattr_role('weight_column', all_args), + feature_column_name=self._getattr_role('feature_column_name', all_args), + label_column_name=self._getattr_role('label_column_name', all_args), + example_weight_column_name=self._getattr_role('example_weight_column_name', all_args), 
normalize_features=self.normalize, caching=self.caching, - l2_weight=self.l2_weight, - l1_weight=self.l1_weight, - opt_tol=self.opt_tol, - memory_size=self.memory_size, + l2_regularization=self.l2_regularization, + l1_regularization=self.l1_regularization, + optimization_tolerance=self.optimization_tolerance, + history_size=self.history_size, enforce_non_negativity=self.enforce_non_negativity, - init_wts_diameter=self.init_wts_diameter, - max_iterations=self.max_iterations, - sgd_initialization_tolerance=self.sgd_init_tol, + initial_weights_diameter=self.initial_weights_diameter, + maximum_number_of_iterations=self.maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=self.stochastic_gradient_descent_initilaization_tolerance, quiet=self.quiet, use_threads=self.use_threads, - num_threads=self.train_threads, + number_of_threads=self.number_of_threads, dense_optimizer=self.dense_optimizer) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py index 2af47365..b0c5e898 100644 --- a/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/sgdbinaryclassifier.py @@ -67,7 +67,7 @@ class SgdBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. Other choices are :py:class:`'exp' `, :py:class:`'hinge' @@ -75,18 +75,18 @@ class SgdBinaryClassifier( `. For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param l2_weight: L2 Regularization constant. + :param l2_regularization: L2 Regularization constant. - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed. :param convergence_tolerance: Exponential moving averaged improvement tolerance for convergence. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. + :param number_of_iterations: Maximum number of iterations; set to 1 to + simulate online learning. - :param init_learning_rate: Initial learning rate (only used by SGD). + :param initial_learning_rate: Initial learning rate (only used by SGD). :param shuffle: Shuffle data every epoch?. 
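The hunks above rename the SGD hyperparameters along the same lines as the Poisson regression ones (for example `l2_weight` becomes `l2_regularization`, `train_threads` becomes `number_of_threads`). As an illustration only, the sketch below collects the `SgdBinaryClassifier` renames from these hunks into a lookup table and rewrites an old-style keyword dict; the helper itself is hypothetical and not part of this change.

```python
# Hypothetical migration table for SgdBinaryClassifier: pre-rename keyword
# arguments on the left, the names introduced in these hunks on the right.
SGD_BINARY_RENAMES = {
    'l2_weight': 'l2_regularization',
    'train_threads': 'number_of_threads',
    'max_iterations': 'number_of_iterations',
    'init_learning_rate': 'initial_learning_rate',
}

def migrate_kwargs(kwargs, renames=SGD_BINARY_RENAMES):
    """Return a copy of kwargs with old parameter names rewritten (illustrative only)."""
    return {renames.get(name, name): value for name, value in kwargs.items()}

# An old-style call site's keywords, translated to the new spelling:
print(migrate_kwargs({'l2_weight': 1e-6, 'max_iterations': 20, 'shuffle': True}))
# -> {'l2_regularization': 1e-06, 'number_of_iterations': 20, 'shuffle': True}
```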
@@ -117,11 +117,11 @@ def __init__( normalize='Auto', caching='Auto', loss='log', - l2_weight=1e-06, - train_threads=None, + l2_regularization=1e-06, + number_of_threads=None, convergence_tolerance=0.0001, - max_iterations=20, - init_learning_rate=0.01, + number_of_iterations=20, + initial_learning_rate=0.01, shuffle=True, positive_instance_weight=1.0, check_frequency=None, @@ -136,11 +136,11 @@ def __init__( 'ClassificationLossFunction', self.__class__.__name__, self.loss) - self.l2_weight = l2_weight - self.train_threads = train_threads + self.l2_regularization = l2_regularization + self.number_of_threads = number_of_threads self.convergence_tolerance = convergence_tolerance - self.max_iterations = max_iterations - self.init_learning_rate = init_learning_rate + self.number_of_iterations = number_of_iterations + self.initial_learning_rate = initial_learning_rate self.shuffle = shuffle self.positive_instance_weight = positive_instance_weight self.check_frequency = check_frequency @@ -152,14 +152,14 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), - weight_column=self._getattr_role( - 'weight_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), normalize_features=self.normalize, caching=self.caching, @@ -167,11 +167,11 @@ def _get_node(self, **all_args): 'ClassificationLossFunction', self.__class__.__name__, self.loss), - l2_weight=self.l2_weight, - num_threads=self.train_threads, + l2_regularization=self.l2_regularization, + number_of_threads=self.number_of_threads, convergence_tolerance=self.convergence_tolerance, - max_iterations=self.max_iterations, - init_learning_rate=self.init_learning_rate, + number_of_iterations=self.number_of_iterations, + initial_learning_rate=self.initial_learning_rate, shuffle=self.shuffle, positive_instance_weight=self.positive_instance_weight, check_frequency=self.check_frequency) diff --git a/src/python/nimbusml/internal/core/linear_model/symsgdbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/symsgdbinaryclassifier.py index 01affd9e..7f7775c7 100644 --- a/src/python/nimbusml/internal/core/linear_model/symsgdbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/symsgdbinaryclassifier.py @@ -66,11 +66,16 @@ class SymSgdBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param number_of_iterations: Number of passes over the data. - :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param l2_regularization: L2 regularization. 
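Each renamed Python argument is ultimately forwarded to an ML.NET entry point, where the entry-point files later in this change store it under a PascalCase manifest key (for example `feature_column_name` becomes `FeatureColumnName`). Below is a hypothetical summary of the role renames plus a one-liner approximating that naming convention; neither is the real code generator.

```python
# Role-argument renames, copied verbatim from the surrounding hunks
# (the FastTree/ranking wrappers further down also rename group_id_column).
ROLE_RENAMES = {
    'feature_column': 'feature_column_name',
    'label_column': 'label_column_name',
    'weight_column': 'example_weight_column_name',
    'group_id_column': 'row_group_column_name',
}

def to_manifest_key(python_name: str) -> str:
    """Approximate the snake_case -> PascalCase mapping used for entry-point settings keys."""
    return ''.join(part.capitalize() for part in python_name.split('_'))

assert to_manifest_key('feature_column_name') == 'FeatureColumnName'
assert to_manifest_key('example_weight_column_name') == 'ExampleWeightColumnName'
assert to_manifest_key('l2_regularization') == 'L2Regularization'
```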
@@ -151,8 +156,12 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role('feature_column', all_args), - label_column=self._getattr_role('label_column', all_args), + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), normalize_features=self.normalize, caching=self.caching, number_of_iterations=self.number_of_iterations, diff --git a/src/python/nimbusml/internal/core/multiclass/onevsrestclassifier.py b/src/python/nimbusml/internal/core/multiclass/onevsrestclassifier.py index d245cf17..0b827a70 100644 --- a/src/python/nimbusml/internal/core/multiclass/onevsrestclassifier.py +++ b/src/python/nimbusml/internal/core/multiclass/onevsrestclassifier.py @@ -48,7 +48,7 @@ class OneVsRestClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param params: Additional arguments sent to compute engine. @@ -115,14 +115,14 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), - weight_column=self._getattr_role( - 'weight_column', + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', all_args), nodes=self.classifier, output_for_sub_graph=self.output_for_sub_graph, diff --git a/src/python/nimbusml/internal/core/naive_bayes/naivebayesclassifier.py b/src/python/nimbusml/internal/core/naive_bayes/naivebayesclassifier.py index e9ffcfd6..a926594d 100644 --- a/src/python/nimbusml/internal/core/naive_bayes/naivebayesclassifier.py +++ b/src/python/nimbusml/internal/core/naive_bayes/naivebayesclassifier.py @@ -63,7 +63,7 @@ class NaiveBayesClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param params: Additional arguments sent to compute engine. 
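Across SymSgdBinaryClassifier, OneVsRestClassifier, and NaiveBayesClassifier the `_get_node` change is the same: roles are resolved under the new `*_column_name` names and merged into the entry-point arguments. The following is a deliberately simplified, illustrative-only sketch of that wiring; the real `_getattr_role` and generated entry points carry many more settings.

```python
# Toy reduction of the _get_node pattern above; `entrypoint` is any callable
# standing in for the generated entry-point wrapper.
def build_node(entrypoint, roles, normalize='Auto', caching='Auto', **all_args):
    algo_args = dict(
        feature_column_name=roles.get('feature_column_name', 'Features'),
        label_column_name=roles.get('label_column_name', 'Label'),
        example_weight_column_name=roles.get('example_weight_column_name'),
        normalize_features=normalize,
        caching=caching,
    )
    all_args.update(algo_args)
    return entrypoint(**all_args)

# A dummy entry point that just echoes its arguments:
node = build_node(lambda **kw: kw, roles={'label_column_name': 'target'})
assert node['label_column_name'] == 'target'
assert node['feature_column_name'] == 'Features'
```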
@@ -100,11 +100,11 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - feature_column=self._getattr_role( - 'feature_column', + feature_column_name=self._getattr_role( + 'feature_column_name', all_args), - label_column=self._getattr_role( - 'label_column', + label_column_name=self._getattr_role( + 'label_column_name', all_args), normalize_features=self.normalize, caching=self.caching) diff --git a/src/python/nimbusml/internal/core/preprocessing/tensorflowscorer.py b/src/python/nimbusml/internal/core/preprocessing/tensorflowscorer.py index 3adbea5b..29a82109 100644 --- a/src/python/nimbusml/internal/core/preprocessing/tensorflowscorer.py +++ b/src/python/nimbusml/internal/core/preprocessing/tensorflowscorer.py @@ -13,12 +13,10 @@ from ...entrypoints.transforms_tensorflowscorer import \ transforms_tensorflowscorer from ...utils.utils import trace -from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles +from ..base_pipeline_item import BasePipelineItem, DefaultSignature -class TensorFlowScorer( - BasePipelineItem, - DefaultSignatureWithRoles): +class TensorFlowScorer(BasePipelineItem, DefaultSignature): """ Transforms the data using the @@ -54,6 +52,8 @@ class TensorFlowScorer( :param output_columns: The name of the outputs. + :param label_column: Training labels. + :param tensor_flow_label: TensorFlow label node. :param optimization_operation: The name of the optimization operation in @@ -72,7 +72,12 @@ class TensorFlowScorer( :param learning_rate_operation: The name of the operation in the TensorFlow graph which sets optimizer learning rate (Optional). - :param learning_rate: Learning rate to use during optimization. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param save_location_operation: Name of the input in TensorFlow graph that specifiy the location for saving/restoring models from disk. @@ -82,6 +87,9 @@ class TensorFlowScorer( :param re_train: Retrain TensorFlow model. + :param add_batch_dimension_inputs: Add a batch dimension to the input e.g. + input = [224, 224, 3] => [-1, 224, 224, 3]. + :param params: Additional arguments sent to compute engine. .. 
index:: transform @@ -97,6 +105,7 @@ def __init__( model_location, input_columns=None, output_columns=None, + label_column=None, tensor_flow_label=None, optimization_operation=None, loss_operation=None, @@ -108,6 +117,7 @@ def __init__( save_location_operation='save/Const', save_operation='save/control_dependency', re_train=False, + add_batch_dimension_inputs=False, **params): BasePipelineItem.__init__( self, type='transform', **params) @@ -115,6 +125,7 @@ def __init__( self.model_location = model_location self.input_columns = input_columns self.output_columns = output_columns + self.label_column = label_column self.tensor_flow_label = tensor_flow_label self.optimization_operation = optimization_operation self.loss_operation = loss_operation @@ -126,6 +137,7 @@ def __init__( self.save_location_operation = save_location_operation self.save_operation = save_operation self.re_train = re_train + self.add_batch_dimension_inputs = add_batch_dimension_inputs @property def _entrypoint(self): @@ -134,10 +146,10 @@ def _entrypoint(self): @trace def _get_node(self, **all_args): algo_args = dict( - label_column=self._getattr_role('label_column', all_args), model_location=self.model_location, input_columns=self.input_columns, output_columns=self.output_columns, + label_column=self.label_column, tensor_flow_label=self.tensor_flow_label, optimization_operation=self.optimization_operation, loss_operation=self.loss_operation, @@ -148,7 +160,8 @@ def _get_node(self, **all_args): learning_rate=self.learning_rate, save_location_operation=self.save_location_operation, save_operation=self.save_operation, - re_train=self.re_train) + re_train=self.re_train, + add_batch_dimension_inputs=self.add_batch_dimension_inputs) all_args.update(algo_args) return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/tokey.py b/src/python/nimbusml/internal/core/preprocessing/tokey.py index f57b997f..55cd7200 100644 --- a/src/python/nimbusml/internal/core/preprocessing/tokey.py +++ b/src/python/nimbusml/internal/core/preprocessing/tokey.py @@ -28,7 +28,7 @@ class ToKey(BasePipelineItem, DefaultSignature): :py:class:`FromKey ` to obtain the orginal values. - :param max_num_terms: Maximum number of terms to keep per column when auto- + :param max_num_terms: Maximum number of keys to keep per column when auto- training. :param term: List of terms. 
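The ToKey docstring above now speaks of "keys" rather than "terms"; the behaviour it describes, with FromKey recovering the original values, can be pictured with a purely conceptual sketch (not the library implementation). In this sketch keys are dense 1-based integers assigned in order of first occurrence, 0 stands for a value that received no key, and at most `max_num_terms` keys are kept.

```python
def to_key(values, max_num_terms=1000000):
    """Conceptual ToKey: map each distinct value to a dense 1-based key."""
    vocab, keys = {}, []
    for v in values:
        if v not in vocab and len(vocab) < max_num_terms:
            vocab[v] = len(vocab) + 1
        keys.append(vocab.get(v, 0))  # 0 marks a value that got no key
    return keys, vocab

def from_key(keys, vocab):
    """Conceptual FromKey: invert the key mapping."""
    inverse = {k: v for v, k in vocab.items()}
    return [inverse.get(k) for k in keys]

keys, vocab = to_key(['red', 'blue', 'red', 'green'])
assert keys == [1, 2, 1, 3]
assert from_key(keys, vocab) == ['red', 'blue', 'red', 'green']
```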
@@ -64,7 +64,7 @@ def __init__( self, max_num_terms=1000000, term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=False, **params): BasePipelineItem.__init__( diff --git a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_dart.py b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_dart.py index e3ed2970..0db3dfe1 100644 --- a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_dart.py +++ b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_dart.py @@ -10,92 +10,90 @@ def dart( - drop_rate=0.1, - max_drop=1, - skip_drop=0.5, + tree_drop_fraction=0.1, + maximum_number_of_dropped_trees_per_round=1, + skip_drop_fraction=0.5, xgboost_dart_mode=False, uniform_drop=False, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): """ **Description** Dropouts meet Multiple Additive Regresion Trees. See https://arxiv.org/abs/1505.01866 - :param drop_rate: Drop ratio for trees. Range:(0,1). (settings). - :param max_drop: Max number of dropped tree in a boosting round. + :param tree_drop_fraction: The drop ratio for trees. Range:(0,1). (settings). - :param skip_drop: Probability for not perform dropping in a + :param maximum_number_of_dropped_trees_per_round: Maximum number + of dropped trees in a boosting round. (settings). + :param skip_drop_fraction: Probability for not dropping in a boosting round. (settings). :param xgboost_dart_mode: True will enable xgboost dart mode. (settings). :param uniform_drop: True will enable uniform drop. (settings). - :param unbalanced_sets: Use for binary classification when - classes are not balanced. (settings). - :param min_split_gain: Minimum loss reduction required to make a - further partition on a leaf node of the tree. the larger, the - more conservative the algorithm will be. (settings). - :param max_depth: Maximum depth of a tree. 0 means no limit. - However, tree still grows by best-first. (settings). - :param min_child_weight: Minimum sum of instance weight(hessian) - needed in a child. If the tree partition step results in a - leaf node with the sum of instance weight less than - min_child_weight, then the building process will give up + :param minimum_split_gain: Minimum loss reduction required to + make a further partition on a leaf node of the tree. the + larger, the more conservative the algorithm will be. + (settings). + :param maximum_tree_depth: Maximum depth of a tree. 0 means no + limit. However, tree still grows by best-first. (settings). + :param minimum_child_weight: Minimum sum of instance + weight(hessian) needed in a child. If the tree partition step + results in a leaf node with the sum of instance weight less + than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. (settings). - :param subsample_freq: Subsample frequency. 0 means no subsample. - If subsampleFreq > 0, it will use a subset(ratio=subsample) - to train. And the subset will be updated on every Subsample - iteratinos. (settings). 
- :param subsample: Subsample ratio of the training instance. - Setting it to 0.5 means that LightGBM randomly collected half - of the data instances to grow trees and this will prevent - overfitting. Range: (0,1]. (settings). + :param subsample_frequency: Subsample frequency for bagging. 0 + means no subsample. Specifies the frequency at which the + bagging occurs, where if this is set to N, the subsampling + will happen at every N iterations.This must be set with + Subsample as this specifies the amount to subsample. + (settings). + :param subsample_fraction: Subsample ratio of the training + instance. Setting it to 0.5 means that LightGBM randomly + collected half of the data instances to grow trees and this + will prevent overfitting. Range: (0,1]. (settings). :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. (settings). - :param reg_lambda: L2 regularization term on weights, increasing - this value will make model more conservative. (settings). - :param reg_alpha: L1 regularization term on weights, increase - this value will make model more conservative. (settings). - :param scale_pos_weight: Control the balance of positive and - negative weights, useful for unbalanced classes. A typical - value to consider: sum(negative cases) / sum(positive cases). + :param l2_regularization: L2 regularization term on weights, + increasing this value will make model more conservative. + (settings). + :param l1_regularization: L1 regularization term on weights, + increase this value will make model more conservative. (settings). """ entrypoint_name = 'dart' settings = {} - if drop_rate is not None: - settings['DropRate'] = try_set( - obj=drop_rate, + if tree_drop_fraction is not None: + settings['TreeDropFraction'] = try_set( + obj=tree_drop_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if max_drop is not None: - settings['MaxDrop'] = try_set( - obj=max_drop, + if maximum_number_of_dropped_trees_per_round is not None: + settings['MaximumNumberOfDroppedTreesPerRound'] = try_set( + obj=maximum_number_of_dropped_trees_per_round, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if skip_drop is not None: - settings['SkipDrop'] = try_set( - obj=skip_drop, + if skip_drop_fraction is not None: + settings['SkipDropFraction'] = try_set( + obj=skip_drop_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -107,38 +105,35 @@ def dart( if uniform_drop is not None: settings['UniformDrop'] = try_set( obj=uniform_drop, none_acceptable=True, is_of_type=bool) - if unbalanced_sets is not None: - settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, 
is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -152,21 +147,16 @@ def dart( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if scale_pos_weight is not None: - settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - is_of_type=numbers.Real) component = Component( name=entrypoint_name, diff --git a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_gbdt.py b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_gbdt.py index b795820d..714590be 100644 --- a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_gbdt.py +++ b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_gbdt.py @@ -10,91 +10,85 @@ def gbdt( - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): """ **Description** Traditional Gradient Boosting Decision Tree. - :param unbalanced_sets: Use for binary classification when - classes are not balanced. (settings). - :param min_split_gain: Minimum loss reduction required to make a - further partition on a leaf node of the tree. the larger, the - more conservative the algorithm will be. (settings). - :param max_depth: Maximum depth of a tree. 0 means no limit. - However, tree still grows by best-first. (settings). - :param min_child_weight: Minimum sum of instance weight(hessian) - needed in a child. If the tree partition step results in a - leaf node with the sum of instance weight less than - min_child_weight, then the building process will give up + :param minimum_split_gain: Minimum loss reduction required to + make a further partition on a leaf node of the tree. the + larger, the more conservative the algorithm will be. + (settings). + :param maximum_tree_depth: Maximum depth of a tree. 0 means no + limit. However, tree still grows by best-first. (settings). + :param minimum_child_weight: Minimum sum of instance + weight(hessian) needed in a child. If the tree partition step + results in a leaf node with the sum of instance weight less + than min_child_weight, then the building process will give up further partitioning. 
In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. (settings). - :param subsample_freq: Subsample frequency. 0 means no subsample. - If subsampleFreq > 0, it will use a subset(ratio=subsample) - to train. And the subset will be updated on every Subsample - iteratinos. (settings). - :param subsample: Subsample ratio of the training instance. - Setting it to 0.5 means that LightGBM randomly collected half - of the data instances to grow trees and this will prevent - overfitting. Range: (0,1]. (settings). + :param subsample_frequency: Subsample frequency for bagging. 0 + means no subsample. Specifies the frequency at which the + bagging occurs, where if this is set to N, the subsampling + will happen at every N iterations.This must be set with + Subsample as this specifies the amount to subsample. + (settings). + :param subsample_fraction: Subsample ratio of the training + instance. Setting it to 0.5 means that LightGBM randomly + collected half of the data instances to grow trees and this + will prevent overfitting. Range: (0,1]. (settings). :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. (settings). - :param reg_lambda: L2 regularization term on weights, increasing - this value will make model more conservative. (settings). - :param reg_alpha: L1 regularization term on weights, increase - this value will make model more conservative. (settings). - :param scale_pos_weight: Control the balance of positive and - negative weights, useful for unbalanced classes. A typical - value to consider: sum(negative cases) / sum(positive cases). + :param l2_regularization: L2 regularization term on weights, + increasing this value will make model more conservative. + (settings). + :param l1_regularization: L1 regularization term on weights, + increase this value will make model more conservative. (settings). 
""" entrypoint_name = 'gbdt' settings = {} - if unbalanced_sets is not None: - settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -108,21 +102,16 @@ def gbdt( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if scale_pos_weight is not None: - settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - is_of_type=numbers.Real) component = Component( name=entrypoint_name, diff --git a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_goss.py b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_goss.py index ed407ae8..063febf1 100644 --- a/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_goss.py +++ b/src/python/nimbusml/internal/entrypoints/_boosterparameterfunction_goss.py @@ -12,16 +12,14 @@ def goss( top_rate=0.2, other_rate=0.1, - unbalanced_sets=False, - min_split_gain=0.0, - max_depth=0, - min_child_weight=0.1, - subsample_freq=0, - subsample=1.0, + minimum_split_gain=0.0, + maximum_tree_depth=0, + minimum_child_weight=0.1, + subsample_frequency=0, + subsample_fraction=1.0, feature_fraction=1.0, - reg_lambda=0.01, - reg_alpha=0.0, - scale_pos_weight=1.0, + l2_regularization=0.01, + l1_regularization=0.0, **params): """ **Description** @@ -31,38 +29,37 @@ def goss( (settings). :param other_rate: Retain ratio for small gradient instances. (settings). - :param unbalanced_sets: Use for binary classification when - classes are not balanced. (settings). - :param min_split_gain: Minimum loss reduction required to make a - further partition on a leaf node of the tree. the larger, the - more conservative the algorithm will be. 
(settings). - :param max_depth: Maximum depth of a tree. 0 means no limit. - However, tree still grows by best-first. (settings). - :param min_child_weight: Minimum sum of instance weight(hessian) - needed in a child. If the tree partition step results in a - leaf node with the sum of instance weight less than - min_child_weight, then the building process will give up + :param minimum_split_gain: Minimum loss reduction required to + make a further partition on a leaf node of the tree. the + larger, the more conservative the algorithm will be. + (settings). + :param maximum_tree_depth: Maximum depth of a tree. 0 means no + limit. However, tree still grows by best-first. (settings). + :param minimum_child_weight: Minimum sum of instance + weight(hessian) needed in a child. If the tree partition step + results in a leaf node with the sum of instance weight less + than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. (settings). - :param subsample_freq: Subsample frequency. 0 means no subsample. - If subsampleFreq > 0, it will use a subset(ratio=subsample) - to train. And the subset will be updated on every Subsample - iteratinos. (settings). - :param subsample: Subsample ratio of the training instance. - Setting it to 0.5 means that LightGBM randomly collected half - of the data instances to grow trees and this will prevent - overfitting. Range: (0,1]. (settings). + :param subsample_frequency: Subsample frequency for bagging. 0 + means no subsample. Specifies the frequency at which the + bagging occurs, where if this is set to N, the subsampling + will happen at every N iterations.This must be set with + Subsample as this specifies the amount to subsample. + (settings). + :param subsample_fraction: Subsample ratio of the training + instance. Setting it to 0.5 means that LightGBM randomly + collected half of the data instances to grow trees and this + will prevent overfitting. Range: (0,1]. (settings). :param feature_fraction: Subsample ratio of columns when constructing each tree. Range: (0,1]. (settings). - :param reg_lambda: L2 regularization term on weights, increasing - this value will make model more conservative. (settings). - :param reg_alpha: L1 regularization term on weights, increase - this value will make model more conservative. (settings). - :param scale_pos_weight: Control the balance of positive and - negative weights, useful for unbalanced classes. A typical - value to consider: sum(negative cases) / sum(positive cases). + :param l2_regularization: L2 regularization term on weights, + increasing this value will make model more conservative. + (settings). + :param l1_regularization: L1 regularization term on weights, + increase this value will make model more conservative. (settings). 
""" @@ -85,38 +82,35 @@ def goss( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if unbalanced_sets is not None: - settings['UnbalancedSets'] = try_set( - obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if min_split_gain is not None: - settings['MinSplitGain'] = try_set( - obj=min_split_gain, + if minimum_split_gain is not None: + settings['MinimumSplitGain'] = try_set( + obj=minimum_split_gain, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if max_depth is not None: - settings['MaxDepth'] = try_set( - obj=max_depth, + if maximum_tree_depth is not None: + settings['MaximumTreeDepth'] = try_set( + obj=maximum_tree_depth, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if min_child_weight is not None: - settings['MinChildWeight'] = try_set( - obj=min_child_weight, + if minimum_child_weight is not None: + settings['MinimumChildWeight'] = try_set( + obj=minimum_child_weight, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if subsample_freq is not None: - settings['SubsampleFreq'] = try_set( - obj=subsample_freq, + if subsample_frequency is not None: + settings['SubsampleFrequency'] = try_set( + obj=subsample_frequency, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Max': 2147483647, 'Min': 0}) - if subsample is not None: - settings['Subsample'] = try_set( - obj=subsample, + if subsample_fraction is not None: + settings['SubsampleFraction'] = try_set( + obj=subsample_fraction, none_acceptable=True, is_of_type=numbers.Real, valid_range={ @@ -130,21 +124,16 @@ def goss( valid_range={ 'Inf': 0.0, 'Max': 1.0}) - if reg_lambda is not None: - settings['RegLambda'] = try_set( - obj=reg_lambda, + if l2_regularization is not None: + settings['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if reg_alpha is not None: - settings['RegAlpha'] = try_set( - obj=reg_alpha, + if l1_regularization is not None: + settings['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if scale_pos_weight is not None: - settings['ScalePosWeight'] = try_set( - obj=scale_pos_weight, - none_acceptable=True, - is_of_type=numbers.Real) component = Component( name=entrypoint_name, diff --git a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreebinaryclassification.py b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreebinaryclassification.py index 3e30b55a..339c9318 100644 --- a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreebinaryclassification.py +++ b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreebinaryclassification.py @@ -11,24 +11,24 @@ def fast_tree_binary_classification( training_data, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', unbalanced_sets=False, best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', 
early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -37,43 +37,42 @@ def fast_tree_binary_classification( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -83,35 +82,37 @@ def fast_tree_binary_classification( Uses a logit-boost boosted tree learner to perform binary classification. - :param num_trees: Total number of decision trees to create in the - ensemble (settings). + :param number_of_trees: Total number of decision trees to create + in the ensemble (settings). :param training_data: The data to be used for training (settings). - :param num_leaves: The max number of leaves in each regression - tree (settings). - :param feature_column: Column to use for features (settings). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (settings). - :param label_column: Column to use for labels (settings). - :param learning_rates: The learning rate (settings). - :param weight_column: Column to use for example weight + :param number_of_leaves: The max number of leaves in each + regression tree (settings). + :param feature_column_name: Column to use for features (settings). - :param group_id_column: Column to use for example groupId + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (settings). + :param label_column_name: Column to use for labels (settings). + :param learning_rate: The learning rate (settings). + :param example_weight_column_name: Column to use for example + weight (settings). + :param row_group_column_name: Column to use for example groupId (settings). :param normalize_features: Normalize option for the feature column (settings). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (settings). 
- :param unbalanced_sets: Should we use derivatives optimized for - unbalanced sets (settings). - :param best_step_ranking_regression_trees: Use best regression - step trees? (settings). + :param unbalanced_sets: Option for using derivatives optimized + for unbalanced sets (settings). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (settings). :param use_line_search: Should we use line search for a step size (settings). - :param num_post_bracket_steps: Number of post-bracket line search - steps (settings). - :param min_step_size: Minimum line search step size (settings). + :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (settings). + :param minimum_step_size: Minimum line search step size + (settings). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (settings). :param early_stopping_rule: Early stopping rule. (Validation set @@ -134,8 +135,8 @@ def fast_tree_binary_classification( times in the GetDerivatives function (settings). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (settings). - :param max_tree_output: Upper bound on absolute value of single - tree output (settings). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (settings). :param random_start: Training starts from random ordering (determined by /r1) (settings). :param filter_zero_lambdas: Filter zero lambdas during training @@ -145,15 +146,15 @@ def fast_tree_binary_classification( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (settings). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (settings). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (settings). - :param num_threads: The number of threads to use (settings). - :param rng_seed: The seed of the random number generator + :param number_of_threads: The number of threads to use (settings). - :param feature_select_seed: The seed of the active feature + :param seed: The seed of the random number generator (settings). + :param feature_selection_seed: The seed of the active feature selection (settings). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (settings). @@ -166,27 +167,28 @@ def fast_tree_binary_classification( dataset preparation to speed up training (settings). :param categorical_split: Whether to do split based on multiple categorical feature values. (settings). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (settings). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (settings). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. 
This is used to reduce overfitting when there many + categorical features. (settings). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (settings). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (settings). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (settings). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (settings). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (settings). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (settings). - :param max_bins: Maximum number of distinct values (bins) per - feature (settings). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (settings). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (settings). :param feature_first_use_penalty: The feature first use penalty @@ -197,16 +199,16 @@ def fast_tree_binary_classification( requirement (should be in the range [0,1) ). (settings). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (settings). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (settings). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (settings). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (settings). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (settings). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (settings). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (settings). :param smoothing: Smoothing paramter for tree regularization (settings). :param allow_empty_trees: When a root split is impossible, allow @@ -214,8 +216,6 @@ def fast_tree_binary_classification( :param feature_compression_level: The level of feature compression to use (settings). :param compress_ensemble: Compress the tree Ensemble (settings). - :param max_trees_after_compression: Maximum Number of trees after - compression (settings). :param print_test_graph: Print metrics graph for the first test set (settings). 
:param print_train_valid_graph: Print Train and Validation @@ -227,50 +227,50 @@ def fast_tree_binary_classification( entrypoint_name = 'FastTreeBinaryClassification' settings = {} - if num_trees is not None: - settings['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + settings['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: settings['TrainingData'] = try_set( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - settings['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + settings['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - settings['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + settings['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - settings['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + settings['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - settings['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + settings['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - settings['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + settings['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - settings['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + settings['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - settings['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + settings['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -292,7 +292,6 @@ def fast_tree_binary_classification( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if unbalanced_sets is not None: settings['UnbalancedSets'] = try_set( @@ -305,14 +304,14 @@ def fast_tree_binary_classification( if use_line_search is not None: settings['UseLineSearch'] = try_set( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - settings['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + settings['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - settings['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not None: + settings['MinimumStepSize'] = try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -366,9 +365,9 @@ def fast_tree_binary_classification( if write_last_ensemble is not None: settings['WriteLastEnsemble'] = try_set( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - settings['MaxTreeOutput'] = try_set( - 
obj=max_tree_output, + if maximum_tree_output is not None: + settings['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -391,19 +390,19 @@ def fast_tree_binary_classification( if parallel_trainer is not None: settings['ParallelTrainer'] = try_set( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - settings['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + settings['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - settings['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + settings['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - settings['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + settings['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -425,24 +424,24 @@ def fast_tree_binary_classification( if categorical_split is not None: settings['CategoricalSplit'] = try_set( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - settings['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + settings['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - settings['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + settings['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - settings['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + settings['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - settings['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + settings['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -454,9 +453,9 @@ def fast_tree_binary_classification( settings['Bundling'] = try_set( obj=bundling, none_acceptable=True, is_of_type=str, values=[ 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - settings['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + settings['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -484,9 +483,9 @@ def fast_tree_binary_classification( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - settings['ExecutionTimes'] = try_set( - obj=execution_times, none_acceptable=True, is_of_type=bool) + if 
execution_time is not None: + settings['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: settings['FeatureFraction'] = try_set( obj=feature_fraction, @@ -497,14 +496,14 @@ def fast_tree_binary_classification( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - settings['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + settings['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - settings['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + settings['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -523,11 +522,6 @@ def fast_tree_binary_classification( if compress_ensemble is not None: settings['CompressEnsemble'] = try_set( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - settings['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: settings['PrintTestGraph'] = try_set( obj=print_test_graph, none_acceptable=True, is_of_type=bool) diff --git a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeranking.py b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeranking.py index b59a9f82..4967b93b 100644 --- a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeranking.py +++ b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeranking.py @@ -11,7 +11,7 @@ def fast_tree_ranking( training_data, - num_trees=100, + number_of_trees=100, num_leaves=20, feature_column='Features', min_documents_in_leafs=10, @@ -90,7 +90,7 @@ def fast_tree_ranking( Trains gradient boosted decision trees to the LambdaRank quasi- gradient. - :param num_trees: Total number of decision trees to create in the + :param number_of_trees: Total number of decision trees to create in the ensemble (settings). :param training_data: The data to be used for training (settings). @@ -125,8 +125,8 @@ def fast_tree_ranking( (settings). :param normalize_query_lambdas: Normalize query lambdas (settings). - :param best_step_ranking_regression_trees: Use best regression - step trees? (settings). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (settings). :param use_line_search: Should we use line search for a step size (settings). 
:param num_post_bracket_steps: Number of post-bracket line search @@ -247,9 +247,9 @@ def fast_tree_ranking( entrypoint_name = 'FastTreeRanking' settings = {} - if num_trees is not None: + if number_of_trees is not None: settings['NumTrees'] = try_set( - obj=num_trees, + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -312,7 +312,6 @@ def fast_tree_ranking( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if custom_gains is not None: settings['CustomGains'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeregression.py b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeregression.py index e62389f1..26227b52 100644 --- a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeregression.py +++ b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreeregression.py @@ -11,20 +11,20 @@ def fast_tree_regression( training_data, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', early_stopping_rule=None, early_stopping_metrics=1, @@ -36,43 +36,42 @@ def fast_tree_regression( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -82,33 +81,35 @@ def fast_tree_regression( Trains gradient boosted decision trees to fit target values using least-squares. - :param num_trees: Total number of decision trees to create in the - ensemble (settings). 
+ :param number_of_trees: Total number of decision trees to create + in the ensemble (settings). :param training_data: The data to be used for training (settings). - :param num_leaves: The max number of leaves in each regression - tree (settings). - :param feature_column: Column to use for features (settings). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (settings). - :param label_column: Column to use for labels (settings). - :param learning_rates: The learning rate (settings). - :param weight_column: Column to use for example weight + :param number_of_leaves: The max number of leaves in each + regression tree (settings). + :param feature_column_name: Column to use for features (settings). - :param group_id_column: Column to use for example groupId + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (settings). + :param label_column_name: Column to use for labels (settings). + :param learning_rate: The learning rate (settings). + :param example_weight_column_name: Column to use for example + weight (settings). + :param row_group_column_name: Column to use for example groupId (settings). :param normalize_features: Normalize option for the feature column (settings). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (settings). - :param best_step_ranking_regression_trees: Use best regression - step trees? (settings). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (settings). :param use_line_search: Should we use line search for a step size (settings). - :param num_post_bracket_steps: Number of post-bracket line search - steps (settings). - :param min_step_size: Minimum line search step size (settings). + :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (settings). + :param minimum_step_size: Minimum line search step size + (settings). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (settings). :param early_stopping_rule: Early stopping rule. (Validation set @@ -131,8 +132,8 @@ def fast_tree_regression( times in the GetDerivatives function (settings). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (settings). - :param max_tree_output: Upper bound on absolute value of single - tree output (settings). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (settings). :param random_start: Training starts from random ordering (determined by /r1) (settings). :param filter_zero_lambdas: Filter zero lambdas during training @@ -142,15 +143,15 @@ def fast_tree_regression( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (settings). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (settings). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (settings). - :param num_threads: The number of threads to use (settings). - :param rng_seed: The seed of the random number generator + :param number_of_threads: The number of threads to use (settings). 
- :param feature_select_seed: The seed of the active feature + :param seed: The seed of the random number generator (settings). + :param feature_selection_seed: The seed of the active feature selection (settings). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (settings). @@ -163,27 +164,28 @@ def fast_tree_regression( dataset preparation to speed up training (settings). :param categorical_split: Whether to do split based on multiple categorical feature values. (settings). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (settings). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (settings). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (settings). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (settings). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (settings). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (settings). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (settings). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (settings). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (settings). - :param max_bins: Maximum number of distinct values (bins) per - feature (settings). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (settings). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (settings). :param feature_first_use_penalty: The feature first use penalty @@ -194,16 +196,16 @@ def fast_tree_regression( requirement (should be in the range [0,1) ). (settings). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (settings). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (settings). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (settings). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (settings). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (settings). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (settings). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (settings). :param smoothing: Smoothing paramter for tree regularization (settings). 
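Taken together, the hunks above amount to a mechanical keyword rename for anyone calling the regenerated FastTree entrypoints directly. A self-contained sketch of that mapping is given below; the helper function and the sample values are illustrative only and are not part of nimbusml, while the name pairs themselves are transcribed from the hunks in this file.

```python
# Representative subset of the FastTree keyword renames in this diff.
# The helper is hypothetical; it only illustrates the old -> new mapping.
_FASTTREE_RENAMES = {
    "num_trees": "number_of_trees",
    "num_leaves": "number_of_leaves",
    "feature_column": "feature_column_name",
    "min_documents_in_leafs": "minimum_example_count_per_leaf",
    "label_column": "label_column_name",
    "learning_rates": "learning_rate",
    "weight_column": "example_weight_column_name",
    "group_id_column": "row_group_column_name",
    "num_post_bracket_steps": "maximum_number_of_line_search_steps",
    "min_step_size": "minimum_step_size",
    "max_tree_output": "maximum_tree_output",
    "num_threads": "number_of_threads",
    "rng_seed": "seed",
    "feature_select_seed": "feature_selection_seed",
    "max_bins": "maximum_bin_count_per_feature",
    "execution_times": "execution_time",
    "bagging_train_fraction": "bagging_example_fraction",
    "split_fraction": "feature_fraction_per_split",
}


def migrate_fasttree_kwargs(old_kwargs):
    """Translate pre-rename keyword arguments to the new spelling.

    Names not in the map are passed through unchanged here; removed
    arguments (see the dropped max_trees_after_compression) would need
    to be discarded separately.
    """
    return {_FASTTREE_RENAMES.get(name, name): value
            for name, value in old_kwargs.items()}


old_call = {"num_trees": 200, "num_leaves": 32, "learning_rates": 0.1}
print(migrate_fasttree_kwargs(old_call))
# {'number_of_trees': 200, 'number_of_leaves': 32, 'learning_rate': 0.1}
```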
:param allow_empty_trees: When a root split is impossible, allow @@ -211,8 +213,6 @@ def fast_tree_regression( :param feature_compression_level: The level of feature compression to use (settings). :param compress_ensemble: Compress the tree Ensemble (settings). - :param max_trees_after_compression: Maximum Number of trees after - compression (settings). :param print_test_graph: Print metrics graph for the first test set (settings). :param print_train_valid_graph: Print Train and Validation @@ -224,50 +224,50 @@ def fast_tree_regression( entrypoint_name = 'FastTreeRegression' settings = {} - if num_trees is not None: - settings['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + settings['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: settings['TrainingData'] = try_set( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - settings['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + settings['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - settings['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + settings['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - settings['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + settings['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - settings['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + settings['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - settings['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + settings['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - settings['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + settings['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - settings['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + settings['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -289,7 +289,6 @@ def fast_tree_regression( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if best_step_ranking_regression_trees is not None: settings['BestStepRankingRegressionTrees'] = try_set( @@ -299,14 +298,14 @@ def fast_tree_regression( if use_line_search is not None: settings['UseLineSearch'] = try_set( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - settings['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + settings['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - settings['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not 
None: + settings['MinimumStepSize'] = try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -360,9 +359,9 @@ def fast_tree_regression( if write_last_ensemble is not None: settings['WriteLastEnsemble'] = try_set( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - settings['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if maximum_tree_output is not None: + settings['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -385,19 +384,19 @@ def fast_tree_regression( if parallel_trainer is not None: settings['ParallelTrainer'] = try_set( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - settings['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + settings['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - settings['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + settings['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - settings['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + settings['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -419,24 +418,24 @@ def fast_tree_regression( if categorical_split is not None: settings['CategoricalSplit'] = try_set( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - settings['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + settings['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - settings['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + settings['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - settings['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + settings['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - settings['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + settings['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -448,9 +447,9 @@ def fast_tree_regression( settings['Bundling'] = try_set( obj=bundling, none_acceptable=True, is_of_type=str, values=[ 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - settings['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + settings['MaximumBinCountPerFeature'] = try_set( + 
obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -478,9 +477,9 @@ def fast_tree_regression( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - settings['ExecutionTimes'] = try_set( - obj=execution_times, none_acceptable=True, is_of_type=bool) + if execution_time is not None: + settings['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: settings['FeatureFraction'] = try_set( obj=feature_fraction, @@ -491,14 +490,14 @@ def fast_tree_regression( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - settings['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + settings['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - settings['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + settings['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -517,11 +516,6 @@ def fast_tree_regression( if compress_ensemble is not None: settings['CompressEnsemble'] = try_set( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - settings['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: settings['PrintTestGraph'] = try_set( obj=print_test_graph, none_acceptable=True, is_of_type=bool) diff --git a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreetweedieregression.py b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreetweedieregression.py index 215b8952..0e96161c 100644 --- a/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreetweedieregression.py +++ b/src/python/nimbusml/internal/entrypoints/_fasttreetrainer_fasttreetweedieregression.py @@ -11,24 +11,24 @@ def fast_tree_tweedie_regression( training_data, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', index=1.5, best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -37,43 +37,42 @@ def fast_tree_tweedie_regression( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + 
seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -84,36 +83,38 @@ def fast_tree_tweedie_regression( Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression. - :param num_trees: Total number of decision trees to create in the - ensemble (settings). + :param number_of_trees: Total number of decision trees to create + in the ensemble (settings). :param training_data: The data to be used for training (settings). - :param num_leaves: The max number of leaves in each regression - tree (settings). - :param feature_column: Column to use for features (settings). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (settings). - :param label_column: Column to use for labels (settings). - :param learning_rates: The learning rate (settings). - :param weight_column: Column to use for example weight + :param number_of_leaves: The max number of leaves in each + regression tree (settings). + :param feature_column_name: Column to use for features (settings). - :param group_id_column: Column to use for example groupId + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (settings). + :param label_column_name: Column to use for labels (settings). + :param learning_rate: The learning rate (settings). + :param example_weight_column_name: Column to use for example + weight (settings). + :param row_group_column_name: Column to use for example groupId (settings). :param normalize_features: Normalize option for the feature column (settings). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (settings). :param index: Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. (settings). - :param best_step_ranking_regression_trees: Use best regression - step trees? (settings). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (settings). :param use_line_search: Should we use line search for a step size (settings). - :param num_post_bracket_steps: Number of post-bracket line search - steps (settings). - :param min_step_size: Minimum line search step size (settings). 
+ :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (settings). + :param minimum_step_size: Minimum line search step size + (settings). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (settings). :param early_stopping_rule: Early stopping rule. (Validation set @@ -136,8 +137,8 @@ def fast_tree_tweedie_regression( times in the GetDerivatives function (settings). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (settings). - :param max_tree_output: Upper bound on absolute value of single - tree output (settings). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (settings). :param random_start: Training starts from random ordering (determined by /r1) (settings). :param filter_zero_lambdas: Filter zero lambdas during training @@ -147,15 +148,15 @@ def fast_tree_tweedie_regression( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (settings). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (settings). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (settings). - :param num_threads: The number of threads to use (settings). - :param rng_seed: The seed of the random number generator + :param number_of_threads: The number of threads to use (settings). - :param feature_select_seed: The seed of the active feature + :param seed: The seed of the random number generator (settings). + :param feature_selection_seed: The seed of the active feature selection (settings). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (settings). @@ -168,27 +169,28 @@ def fast_tree_tweedie_regression( dataset preparation to speed up training (settings). :param categorical_split: Whether to do split based on multiple categorical feature values. (settings). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (settings). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (settings). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (settings). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (settings). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (settings). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (settings). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (settings). 
:param bias: Bias for calculating gradient for each feature bin for a categorical feature. (settings). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (settings). - :param max_bins: Maximum number of distinct values (bins) per - feature (settings). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (settings). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (settings). :param feature_first_use_penalty: The feature first use penalty @@ -199,16 +201,16 @@ def fast_tree_tweedie_regression( requirement (should be in the range [0,1) ). (settings). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (settings). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (settings). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (settings). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (settings). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (settings). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (settings). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (settings). :param smoothing: Smoothing paramter for tree regularization (settings). :param allow_empty_trees: When a root split is impossible, allow @@ -216,8 +218,6 @@ def fast_tree_tweedie_regression( :param feature_compression_level: The level of feature compression to use (settings). :param compress_ensemble: Compress the tree Ensemble (settings). - :param max_trees_after_compression: Maximum Number of trees after - compression (settings). :param print_test_graph: Print metrics graph for the first test set (settings). 
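Every one of these generated wrappers funnels its arguments into a flat settings dictionary through the package's try_set helper, and the value lists passed to it are also how this diff drops the 'Disk' caching option. The stand-in below is deliberately simplified and is not the real try_set; it only mirrors the type and allowed-value checks visible in these hunks.

```python
import numbers


def try_set_sketch(obj, none_acceptable=True, is_of_type=object, values=None):
    """Simplified stand-in for nimbusml's try_set: validate one settings value."""
    if obj is None:
        if not none_acceptable:
            raise ValueError("this setting may not be None")
        return None
    if not isinstance(obj, is_of_type):
        raise TypeError("expected {}, got {!r}".format(is_of_type, obj))
    if values is not None and obj not in values:
        raise ValueError("{!r} is not one of {!r}".format(obj, values))
    return obj


settings = {}
settings['NumberOfTrees'] = try_set_sketch(100, is_of_type=numbers.Real)
# After this diff, 'Disk' is no longer an accepted caching value.
settings['Caching'] = try_set_sketch('Memory', is_of_type=str,
                                     values=['Auto', 'Memory', 'None'])
print(settings)  # {'NumberOfTrees': 100, 'Caching': 'Memory'}
```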
:param print_train_valid_graph: Print Train and Validation @@ -229,50 +229,50 @@ def fast_tree_tweedie_regression( entrypoint_name = 'FastTreeTweedieRegression' settings = {} - if num_trees is not None: - settings['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + settings['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: settings['TrainingData'] = try_set( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - settings['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + settings['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - settings['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + settings['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - settings['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + settings['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - settings['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + settings['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - settings['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + settings['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - settings['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + settings['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - settings['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + settings['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -294,7 +294,6 @@ def fast_tree_tweedie_regression( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if index is not None: settings['Index'] = try_set( @@ -309,14 +308,14 @@ def fast_tree_tweedie_regression( if use_line_search is not None: settings['UseLineSearch'] = try_set( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - settings['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + settings['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - settings['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not None: + settings['MinimumStepSize'] = try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -370,9 +369,9 @@ def fast_tree_tweedie_regression( if write_last_ensemble is not None: settings['WriteLastEnsemble'] = try_set( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - settings['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if 
maximum_tree_output is not None: + settings['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -395,19 +394,19 @@ def fast_tree_tweedie_regression( if parallel_trainer is not None: settings['ParallelTrainer'] = try_set( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - settings['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + settings['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - settings['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + settings['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - settings['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + settings['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -429,24 +428,24 @@ def fast_tree_tweedie_regression( if categorical_split is not None: settings['CategoricalSplit'] = try_set( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - settings['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + settings['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - settings['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + settings['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - settings['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + settings['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - settings['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + settings['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -458,9 +457,9 @@ def fast_tree_tweedie_regression( settings['Bundling'] = try_set( obj=bundling, none_acceptable=True, is_of_type=str, values=[ 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - settings['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + settings['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -488,9 +487,9 @@ def fast_tree_tweedie_regression( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - settings['ExecutionTimes'] = try_set( - obj=execution_times, none_acceptable=True, is_of_type=bool) + if execution_time is not None: + 
settings['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: settings['FeatureFraction'] = try_set( obj=feature_fraction, @@ -501,14 +500,14 @@ def fast_tree_tweedie_regression( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - settings['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + settings['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - settings['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + settings['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -527,11 +526,6 @@ def fast_tree_tweedie_regression( if compress_ensemble is not None: settings['CompressEnsemble'] = try_set( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - settings['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: settings['PrintTestGraph'] = try_set( obj=print_test_graph, none_acceptable=True, is_of_type=bool) diff --git a/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngram.py b/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngram.py index cf72652c..eb746746 100644 --- a/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngram.py +++ b/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngram.py @@ -23,10 +23,10 @@ def n_gram( :param ngram_length: Ngram length (settings). :param skip_length: Maximum number of tokens to skip when - constructing an ngram (settings). - :param all_lengths: Whether to include all ngram lengths up to + constructing an n-gram (settings). + :param all_lengths: Whether to include all n-gram lengths up to NgramLength or only NgramLength (settings). - :param max_num_terms: Maximum number of ngrams to store in the + :param max_num_terms: Maximum number of n-grams to store in the dictionary (settings). :param weighting: The weighting criteria (settings). """ diff --git a/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngramhash.py b/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngramhash.py index 2fae7293..dbc8bc4d 100644 --- a/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngramhash.py +++ b/src/python/nimbusml/internal/entrypoints/_ngramextractor_ngramhash.py @@ -10,41 +10,41 @@ def n_gram_hash( - hash_bits=16, + number_of_bits=16, ngram_length=1, skip_length=0, all_lengths=True, seed=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, **params): """ **Description** Extracts NGrams from text and convert them to vector using hashing trick. - :param hash_bits: Number of bits to hash into. Must be between 1 - and 30, inclusive. (settings). + :param number_of_bits: Number of bits to hash into. Must be + between 1 and 30, inclusive. (settings). :param ngram_length: Ngram length (settings). :param skip_length: Maximum number of tokens to skip when - constructing an ngram (settings). - :param all_lengths: Whether to include all ngram lengths up to + constructing an n-gram (settings). + :param all_lengths: Whether to include all n-gram lengths up to ngramLength or only ngramLength (settings). 
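The NGramHash signature above follows the same scheme: hash_bits becomes number_of_bits and invert_hash becomes maximum_number_of_inverts. A hypothetical before/after call site is sketched below; the import path simply mirrors the file touched in this diff and assumes a nimbusml build that already carries these renames.

```python
# Illustrative call site only, not taken from nimbusml's own tests.
from nimbusml.internal.entrypoints._ngramextractor_ngramhash import n_gram_hash

# Before this change the equivalent call was:
#   n_gram_hash(hash_bits=20, ngram_length=2, invert_hash=-1)
component = n_gram_hash(
    number_of_bits=20,             # was hash_bits
    ngram_length=2,
    maximum_number_of_inverts=-1,  # was invert_hash; -1 still means "no limit"
)
print(component)
```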
:param seed: Hashing seed (settings). :param ordered: Whether the position of each source column should be included in the hash (when there are multiple source columns). (settings). - :param invert_hash: Limit the number of keys used to generate the - slot name to this many. 0 means no invert hashing, -1 means - no limit. (settings). + :param maximum_number_of_inverts: Limit the number of keys used + to generate the slot name to this many. 0 means no invert + hashing, -1 means no limit. (settings). """ entrypoint_name = 'NGramHash' settings = {} - if hash_bits is not None: - settings['HashBits'] = try_set( - obj=hash_bits, + if number_of_bits is not None: + settings['NumberOfBits'] = try_set( + obj=number_of_bits, none_acceptable=True, is_of_type=numbers.Real) if ngram_length is not None: @@ -70,9 +70,9 @@ def n_gram_hash( if ordered is not None: settings['Ordered'] = try_set( obj=ordered, none_acceptable=True, is_of_type=bool) - if invert_hash is not None: - settings['InvertHash'] = try_set( - obj=invert_hash, + if maximum_number_of_inverts is not None: + settings['MaximumNumberOfInverts'] = try_set( + obj=maximum_number_of_inverts, none_acceptable=True, is_of_type=numbers.Real) diff --git a/src/python/nimbusml/internal/entrypoints/data_predictormodelarrayconverter.py b/src/python/nimbusml/internal/entrypoints/data_predictormodelarrayconverter.py index 62e5dbb0..af282b05 100644 --- a/src/python/nimbusml/internal/entrypoints/data_predictormodelarrayconverter.py +++ b/src/python/nimbusml/internal/entrypoints/data_predictormodelarrayconverter.py @@ -9,29 +9,29 @@ def data_predictormodelarrayconverter( - model, - output_model, + models, + output_models, **params): """ **Description** Create an array variable of PredictorModel - :param model: The models (inputs). - :param output_model: The model array (outputs). + :param models: The models (inputs). + :param output_models: The model array (outputs). 
""" entrypoint_name = 'Data.PredictorModelArrayConverter' inputs = {} outputs = {} - if model is not None: - inputs['Model'] = try_set( - obj=model, + if models is not None: + inputs['Models'] = try_set( + obj=models, none_acceptable=False, is_of_type=list) - if output_model is not None: - outputs['OutputModel'] = try_set( - obj=output_model, + if output_models is not None: + outputs['OutputModels'] = try_set( + obj=output_models, none_acceptable=False, is_of_type=list) diff --git a/src/python/nimbusml/internal/entrypoints/data_textloader.py b/src/python/nimbusml/internal/entrypoints/data_textloader.py index e53f4434..1d1db853 100644 --- a/src/python/nimbusml/internal/entrypoints/data_textloader.py +++ b/src/python/nimbusml/internal/entrypoints/data_textloader.py @@ -38,15 +38,15 @@ def data_textloader( is_of_type=dict, field_names=[ 'Column', - 'UseThreads', - 'HeaderFile', - 'MaxRows', 'AllowQuoting', 'AllowSparse', 'InputSize', 'Separator', 'TrimWhitespace', - 'HasHeader']) + 'HasHeader', + 'UseThreads', + 'HeaderFile', + 'MaxRows']) if data is not None: outputs['Data'] = try_set( obj=data, diff --git a/src/python/nimbusml/internal/entrypoints/models_crossvalidationresultscombiner.py b/src/python/nimbusml/internal/entrypoints/models_crossvalidationresultscombiner.py index 4af57dc7..7af1b398 100644 --- a/src/python/nimbusml/internal/entrypoints/models_crossvalidationresultscombiner.py +++ b/src/python/nimbusml/internal/entrypoints/models_crossvalidationresultscombiner.py @@ -19,8 +19,8 @@ def models_crossvalidationresultscombiner( warnings=None, kind='SignatureBinaryClassifierTrainer', label_column='Label', - weight_column=None, - group_column=None, + weight_column='Weight', + group_column='GroupId', name_column='Name', **params): """ @@ -76,7 +76,7 @@ def models_crossvalidationresultscombiner( is_of_type=str, values=[ 'SignatureBinaryClassifierTrainer', - 'SignatureMultiClassClassifierTrainer', + 'SignatureMulticlassClassificationTrainer', 'SignatureRankerTrainer', 'SignatureRegressorTrainer', 'SignatureMultiOutputRegressorTrainer', diff --git a/src/python/nimbusml/internal/entrypoints/models_crossvalidator.py b/src/python/nimbusml/internal/entrypoints/models_crossvalidator.py index 4222751d..e3fe3873 100644 --- a/src/python/nimbusml/internal/entrypoints/models_crossvalidator.py +++ b/src/python/nimbusml/internal/entrypoints/models_crossvalidator.py @@ -24,8 +24,8 @@ def models_crossvalidator( num_folds=2, kind='SignatureBinaryClassifierTrainer', label_column='Label', - weight_column=None, - group_column=None, + weight_column='Weight', + group_column='GroupId', name_column='Name', **params): """ @@ -108,7 +108,7 @@ def models_crossvalidator( is_of_type=str, values=[ 'SignatureBinaryClassifierTrainer', - 'SignatureMultiClassClassifierTrainer', + 'SignatureMulticlassClassificationTrainer', 'SignatureRankerTrainer', 'SignatureRegressorTrainer', 'SignatureMultiOutputRegressorTrainer', diff --git a/src/python/nimbusml/internal/entrypoints/models_oneversusall.py b/src/python/nimbusml/internal/entrypoints/models_oneversusall.py index ee26388e..ec8a2db1 100644 --- a/src/python/nimbusml/internal/entrypoints/models_oneversusall.py +++ b/src/python/nimbusml/internal/entrypoints/models_oneversusall.py @@ -13,10 +13,10 @@ def models_oneversusall( training_data, output_for_sub_graph=0, predictor_model=None, - feature_column='Features', + feature_column_name='Features', use_probabilities=True, - label_column='Label', - weight_column=None, + label_column_name='Label', + example_weight_column_name=None, 
normalize_features='Auto', caching='Auto', **params): @@ -30,14 +30,15 @@ def models_oneversusall( :param training_data: The data to be used for training (inputs). :param output_for_sub_graph: The training subgraph output. (inputs). - :param feature_column: Column to use for features (inputs). + :param feature_column_name: Column to use for features (inputs). :param use_probabilities: Use probabilities in OVA combiner (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param predictor_model: The trained multiclass model (outputs). """ @@ -62,9 +63,9 @@ def models_oneversusall( none_acceptable=False, is_of_type=dict, field_names=['Model']) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -73,15 +74,15 @@ def models_oneversusall( obj=use_probabilities, none_acceptable=True, is_of_type=bool) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -103,7 +104,6 @@ def models_oneversusall( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if predictor_model is not None: outputs['PredictorModel'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/models_ovamodelcombiner.py b/src/python/nimbusml/internal/entrypoints/models_ovamodelcombiner.py index dcf4b856..3acbe614 100644 --- a/src/python/nimbusml/internal/entrypoints/models_ovamodelcombiner.py +++ b/src/python/nimbusml/internal/entrypoints/models_ovamodelcombiner.py @@ -13,9 +13,9 @@ def models_ovamodelcombiner( predictor_model=None, model_array=None, use_probabilities=True, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', **params): @@ -27,12 +27,13 @@ def models_ovamodelcombiner( :param training_data: The data to be used for training (inputs). :param use_probabilities: Use probabilities from learners instead of raw values. (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). 
- :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param predictor_model: Predictor model (outputs). """ @@ -56,21 +57,21 @@ def models_ovamodelcombiner( obj=use_probabilities, none_acceptable=True, is_of_type=bool) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -92,7 +93,6 @@ def models_ovamodelcombiner( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if predictor_model is not None: outputs['PredictorModel'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/models_rankerevaluator.py b/src/python/nimbusml/internal/entrypoints/models_rankingevaluator.py similarity index 97% rename from src/python/nimbusml/internal/entrypoints/models_rankerevaluator.py rename to src/python/nimbusml/internal/entrypoints/models_rankingevaluator.py index 79d7313b..d82dc772 100644 --- a/src/python/nimbusml/internal/entrypoints/models_rankerevaluator.py +++ b/src/python/nimbusml/internal/entrypoints/models_rankingevaluator.py @@ -1,6 +1,6 @@ # - Generated by tools/entrypoint_compiler.py: do not edit by hand """ -Models.RankerEvaluator +Models.RankingEvaluator """ import numbers @@ -9,7 +9,7 @@ from ..utils.utils import try_set, unlist -def models_rankerevaluator( +def models_rankingevaluator( data, warnings=None, overall_metrics=None, @@ -43,7 +43,7 @@ def models_rankerevaluator( (outputs). 
""" - entrypoint_name = 'Models.RankerEvaluator' + entrypoint_name = 'Models.RankingEvaluator' inputs = {} outputs = {} diff --git a/src/python/nimbusml/internal/entrypoints/models_traintestevaluator.py b/src/python/nimbusml/internal/entrypoints/models_traintestevaluator.py index d4ac0ab2..68dd0a43 100644 --- a/src/python/nimbusml/internal/entrypoints/models_traintestevaluator.py +++ b/src/python/nimbusml/internal/entrypoints/models_traintestevaluator.py @@ -28,8 +28,8 @@ def models_traintestevaluator( pipeline_id=None, include_training_metrics=False, label_column='Label', - weight_column=None, - group_column=None, + weight_column='Weight', + group_column='GroupId', name_column='Name', **params): """ @@ -115,7 +115,7 @@ def models_traintestevaluator( is_of_type=str, values=[ 'SignatureBinaryClassifierTrainer', - 'SignatureMultiClassClassifierTrainer', + 'SignatureMulticlassClassificationTrainer', 'SignatureRankerTrainer', 'SignatureRegressorTrainer', 'SignatureMultiOutputRegressorTrainer', diff --git a/src/python/nimbusml/internal/entrypoints/trainers_averagedperceptronbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_averagedperceptronbinaryclassifier.py index d74fac15..6db6aab4 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_averagedperceptronbinaryclassifier.py @@ -12,57 +12,56 @@ def trainers_averagedperceptronbinaryclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', normalize_features='Auto', caching='Auto', loss_function=None, learning_rate=1.0, decrease_learning_rate=False, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, calibrator=None, max_calibration_examples=1000000, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, **params): """ **Description** Averaged Perceptron Binary Classifier. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). :param learning_rate: Learning rate (inputs). :param decrease_learning_rate: Decrease learning rate (inputs). - :param l2_regularizer_weight: L2 Regularization Weight (inputs). - :param num_iterations: Number of iterations (inputs). - :param init_wts_diameter: Init weights diameter (inputs). + :param l2_regularization: L2 Regularization Weight (inputs). + :param number_of_iterations: Number of iterations (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). :param calibrator: The calibrator kind to apply to the predictor. Specify null for no calibration (inputs). 
:param max_calibration_examples: The maximum number of examples to use when training the calibrator (inputs). :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average (inputs). - :param do_lazy_updates: Instead of updating averaged weights on - every example, only update when loss is nonzero (inputs). + :param lazy_update: Instead of updating averaged weights on every + example, only update when loss is nonzero (inputs). :param recency_gain: Extra weight given to more recent updates (inputs). - :param recency_gain_multi: Whether Recency Gain is multiplicative - (vs. additive) (inputs). + :param recency_gain_multiplicative: Whether Recency Gain is + multiplicative (vs. additive) (inputs). :param averaged: Do averaging? (inputs). :param averaged_tolerance: The inexactness tolerance for averaging (inputs). @@ -70,8 +69,6 @@ def trainers_averagedperceptronbinaryclassifier( (inputs). :param shuffle: Whether to shuffle for each training iteration (inputs). - :param streaming_cache_size: Size of cache when trained in Scope - (inputs). :param predictor_model: The trained model (outputs). """ @@ -84,15 +81,15 @@ def trainers_averagedperceptronbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -114,7 +111,6 @@ def trainers_averagedperceptronbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( @@ -129,19 +125,19 @@ def trainers_averagedperceptronbinaryclassifier( if decrease_learning_rate is not None: inputs['DecreaseLearningRate'] = try_set( obj=decrease_learning_rate, none_acceptable=True, is_of_type=bool) - if l2_regularizer_weight is not None: - inputs['L2RegularizerWeight'] = try_set( - obj=l2_regularizer_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if num_iterations is not None: - inputs['NumIterations'] = try_set( - obj=num_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) if calibrator is not None: @@ -159,9 +155,9 @@ def trainers_averagedperceptronbinaryclassifier( obj=reset_weights_after_x_examples, none_acceptable=True, is_of_type=numbers.Real) - if do_lazy_updates is not None: - inputs['DoLazyUpdates'] = try_set( - obj=do_lazy_updates, + if lazy_update is not None: + inputs['LazyUpdate'] = try_set( + obj=lazy_update, none_acceptable=True, is_of_type=bool) if recency_gain is not None: @@ -169,9 +165,9 @@ def trainers_averagedperceptronbinaryclassifier( obj=recency_gain, none_acceptable=True, is_of_type=numbers.Real) - if recency_gain_multi is not None: - 
inputs['RecencyGainMulti'] = try_set( - obj=recency_gain_multi, + if recency_gain_multiplicative is not None: + inputs['RecencyGainMultiplicative'] = try_set( + obj=recency_gain_multiplicative, none_acceptable=True, is_of_type=bool) if averaged is not None: @@ -194,11 +190,6 @@ def trainers_averagedperceptronbinaryclassifier( obj=shuffle, none_acceptable=True, is_of_type=bool) - if streaming_cache_size is not None: - inputs['StreamingCacheSize'] = try_set( - obj=streaming_cache_size, - none_acceptable=True, - is_of_type=numbers.Real) if predictor_model is not None: outputs['PredictorModel'] = try_set( obj=predictor_model, none_acceptable=False, is_of_type=str) diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fastforestbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_fastforestbinaryclassifier.py index d4fc432f..bf83a135 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fastforestbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fastforestbinaryclassifier.py @@ -12,50 +12,49 @@ def trainers_fastforestbinaryclassifier( training_data, predictor_model=None, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', - max_tree_output=100.0, + maximum_output_magnitude_per_tree=100.0, calibrator=None, max_calibration_examples=1000000, - quantile_sample_count=100, + number_of_quantile_samples=100, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - bagging_train_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -64,37 +63,37 @@ def trainers_fastforestbinaryclassifier( **Description** Uses a random forest learner to perform binary classification. - :param num_trees: Total number of decision trees to create in the - ensemble (inputs). + :param number_of_trees: Total number of decision trees to create + in the ensemble (inputs). :param training_data: The data to be used for training (inputs). - :param num_leaves: The max number of leaves in each regression - tree (inputs). - :param feature_column: Column to use for features (inputs). 
- :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param number_of_leaves: The max number of leaves in each + regression tree (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param max_tree_output: Upper bound on absolute value of single - tree output (inputs). + :param maximum_output_magnitude_per_tree: Upper bound on absolute + value of single tree output (inputs). :param calibrator: The calibrator kind to apply to the predictor. Specify null for no calibration (inputs). :param max_calibration_examples: The maximum number of examples to use when training the calibrator (inputs). - :param quantile_sample_count: Number of labels to be sampled from - each leaf to make the distribtuion (inputs). + :param number_of_quantile_samples: Number of labels to be sampled + from each leaf to make the distribution (inputs). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (inputs). - :param num_threads: The number of threads to use (inputs). - :param rng_seed: The seed of the random number generator - (inputs). - :param feature_select_seed: The seed of the active feature + :param number_of_threads: The number of threads to use (inputs). + :param seed: The seed of the random number generator (inputs). + :param feature_selection_seed: The seed of the active feature selection (inputs). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (inputs). @@ -107,27 +106,28 @@ def trainers_fastforestbinaryclassifier( dataset preparation to speed up training (inputs). :param categorical_split: Whether to do split based on multiple categorical feature values. (inputs). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (inputs). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (inputs). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (inputs). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (inputs). 
+ :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (inputs). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (inputs). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (inputs). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (inputs). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (inputs). :param feature_first_use_penalty: The feature first use penalty @@ -138,16 +138,16 @@ def trainers_fastforestbinaryclassifier( requirement (should be in the range [0,1) ). (inputs). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (inputs). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (inputs). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (inputs). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (inputs). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (inputs). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (inputs). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (inputs). :param smoothing: Smoothing paramter for tree regularization (inputs). :param allow_empty_trees: When a root split is impossible, allow @@ -155,8 +155,6 @@ def trainers_fastforestbinaryclassifier( :param feature_compression_level: The level of feature compression to use (inputs). :param compress_ensemble: Compress the tree Ensemble (inputs). - :param max_trees_after_compression: Maximum Number of trees after - compression (inputs). :param print_test_graph: Print metrics graph for the first test set (inputs). 
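# Editor's note (sketch, not part of the patch): across these trainer wrappers the 'Disk'
# option is dropped from the caching enum, leaving only the values kept below;
# try_set(values=[...]) presumably rejects anything else.
valid_caching_values = ['Auto', 'Memory', 'None']   # 'Disk' removed in this patch
assert 'Disk' not in valid_caching_values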
:param print_train_valid_graph: Print Train and Validation @@ -170,9 +168,9 @@ def trainers_fastforestbinaryclassifier( inputs = {} outputs = {} - if num_trees is not None: - inputs['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + inputs['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -180,37 +178,37 @@ def trainers_fastforestbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - inputs['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -232,11 +230,10 @@ def trainers_fastforestbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if max_tree_output is not None: - inputs['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if maximum_output_magnitude_per_tree is not None: + inputs['MaximumOutputMagnitudePerTree'] = try_set( + obj=maximum_output_magnitude_per_tree, none_acceptable=True, is_of_type=numbers.Real) if calibrator is not None: @@ -249,9 +246,9 @@ def trainers_fastforestbinaryclassifier( obj=max_calibration_examples, none_acceptable=True, is_of_type=numbers.Real) - if quantile_sample_count is not None: - inputs['QuantileSampleCount'] = try_set( - obj=quantile_sample_count, + if number_of_quantile_samples is not None: + inputs['NumberOfQuantileSamples'] = try_set( + obj=number_of_quantile_samples, none_acceptable=True, is_of_type=numbers.Real) if parallel_trainer is not None: @@ -259,19 +256,19 @@ def trainers_fastforestbinaryclassifier( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - inputs['FeatureSelectSeed'] = try_set( - 
obj=feature_select_seed, + if feature_selection_seed is not None: + inputs['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -299,24 +296,24 @@ def trainers_fastforestbinaryclassifier( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - inputs['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + inputs['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - inputs['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - inputs['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + inputs['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - inputs['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + inputs['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -333,9 +330,9 @@ def trainers_fastforestbinaryclassifier( 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -363,9 +360,9 @@ def trainers_fastforestbinaryclassifier( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - inputs['ExecutionTimes'] = try_set( - obj=execution_times, + if execution_time is not None: + inputs['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: @@ -378,14 +375,14 @@ def trainers_fastforestbinaryclassifier( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - inputs['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + inputs['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - inputs['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + inputs['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -408,11 +405,6 @@ def trainers_fastforestbinaryclassifier( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - inputs['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - 
none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: inputs['PrintTestGraph'] = try_set( obj=print_test_graph, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fastforestregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_fastforestregressor.py index bc6e0156..24fd47bc 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fastforestregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fastforestregressor.py @@ -12,48 +12,47 @@ def trainers_fastforestregressor( training_data, predictor_model=None, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', shuffle_labels=False, - quantile_sample_count=100, + number_of_quantile_samples=100, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=0.7, bagging_size=1, - bagging_train_fraction=0.7, - split_fraction=0.7, + bagging_example_fraction=0.7, + feature_fraction_per_split=0.7, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -62,34 +61,34 @@ def trainers_fastforestregressor( **Description** Trains a random forest to fit target values using least-squares. - :param num_trees: Total number of decision trees to create in the - ensemble (inputs). + :param number_of_trees: Total number of decision trees to create + in the ensemble (inputs). :param training_data: The data to be used for training (inputs). - :param num_leaves: The max number of leaves in each regression - tree (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param number_of_leaves: The max number of leaves in each + regression tree (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (inputs). 
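# Editor's sketch (assumed import path, illustrative values): both fast-forest wrappers now
# expose the longer argument names; the old short names (num_trees, num_leaves,
# min_documents_in_leafs, rng_seed, ...) are gone, and max_trees_after_compression was
# removed entirely.
from nimbusml.internal.entrypoints.trainers_fastforestregressor import (
    trainers_fastforestregressor,
)

node = trainers_fastforestregressor(
    training_data='$training_data',        # assumed graph-variable name
    number_of_trees=100,                   # was num_trees
    number_of_leaves=20,                   # was num_leaves
    minimum_example_count_per_leaf=10,     # was min_documents_in_leafs
    number_of_threads=4,                   # was num_threads
    seed=123,                              # was rng_seed
    maximum_bin_count_per_feature=255,     # was max_bins
    predictor_model='$predictor_model',    # assumed output-variable name
)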
+ :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param shuffle_labels: Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass. (inputs). - :param quantile_sample_count: Number of labels to be sampled from - each leaf to make the distribtuion (inputs). + :param number_of_quantile_samples: Number of labels to be sampled + from each leaf to make the distribution (inputs). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (inputs). - :param num_threads: The number of threads to use (inputs). - :param rng_seed: The seed of the random number generator - (inputs). - :param feature_select_seed: The seed of the active feature + :param number_of_threads: The number of threads to use (inputs). + :param seed: The seed of the random number generator (inputs). + :param feature_selection_seed: The seed of the active feature selection (inputs). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (inputs). @@ -102,27 +101,28 @@ def trainers_fastforestregressor( dataset preparation to speed up training (inputs). :param categorical_split: Whether to do split based on multiple categorical feature values. (inputs). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (inputs). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (inputs). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (inputs). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (inputs). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (inputs). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (inputs). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (inputs). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (inputs). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (inputs). 
:param feature_first_use_penalty: The feature first use penalty @@ -133,16 +133,16 @@ def trainers_fastforestregressor( requirement (should be in the range [0,1) ). (inputs). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (inputs). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (inputs). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (inputs). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (inputs). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (inputs). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (inputs). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (inputs). :param smoothing: Smoothing paramter for tree regularization (inputs). :param allow_empty_trees: When a root split is impossible, allow @@ -150,8 +150,6 @@ def trainers_fastforestregressor( :param feature_compression_level: The level of feature compression to use (inputs). :param compress_ensemble: Compress the tree Ensemble (inputs). - :param max_trees_after_compression: Maximum Number of trees after - compression (inputs). :param print_test_graph: Print metrics graph for the first test set (inputs). :param print_train_valid_graph: Print Train and Validation @@ -165,9 +163,9 @@ def trainers_fastforestregressor( inputs = {} outputs = {} - if num_trees is not None: - inputs['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + inputs['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -175,37 +173,37 @@ def trainers_fastforestregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - inputs['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ 
-227,16 +225,15 @@ def trainers_fastforestregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if shuffle_labels is not None: inputs['ShuffleLabels'] = try_set( obj=shuffle_labels, none_acceptable=True, is_of_type=bool) - if quantile_sample_count is not None: - inputs['QuantileSampleCount'] = try_set( - obj=quantile_sample_count, + if number_of_quantile_samples is not None: + inputs['NumberOfQuantileSamples'] = try_set( + obj=number_of_quantile_samples, none_acceptable=True, is_of_type=numbers.Real) if parallel_trainer is not None: @@ -244,19 +241,19 @@ def trainers_fastforestregressor( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - inputs['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + inputs['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -284,24 +281,24 @@ def trainers_fastforestregressor( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - inputs['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + inputs['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - inputs['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - inputs['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + inputs['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - inputs['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + inputs['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -318,9 +315,9 @@ def trainers_fastforestregressor( 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -348,9 +345,9 @@ def trainers_fastforestregressor( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - inputs['ExecutionTimes'] = try_set( - 
obj=execution_times, + if execution_time is not None: + inputs['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: @@ -363,14 +360,14 @@ def trainers_fastforestregressor( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - inputs['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + inputs['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - inputs['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + inputs['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -393,11 +390,6 @@ def trainers_fastforestregressor( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - inputs['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: inputs['PrintTestGraph'] = try_set( obj=print_test_graph, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fasttreebinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_fasttreebinaryclassifier.py index 827d4cc0..21ce3bb8 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fasttreebinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fasttreebinaryclassifier.py @@ -12,24 +12,24 @@ def trainers_fasttreebinaryclassifier( training_data, predictor_model=None, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', unbalanced_sets=False, best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -38,43 +38,42 @@ def trainers_fasttreebinaryclassifier( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, 
bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -84,33 +83,34 @@ def trainers_fasttreebinaryclassifier( Uses a logit-boost boosted tree learner to perform binary classification. - :param num_trees: Total number of decision trees to create in the - ensemble (inputs). + :param number_of_trees: Total number of decision trees to create + in the ensemble (inputs). :param training_data: The data to be used for training (inputs). - :param num_leaves: The max number of leaves in each regression - tree (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (inputs). - :param label_column: Column to use for labels (inputs). - :param learning_rates: The learning rate (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param number_of_leaves: The max number of leaves in each + regression tree (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (inputs). + :param label_column_name: Column to use for labels (inputs). + :param learning_rate: The learning rate (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param unbalanced_sets: Should we use derivatives optimized for - unbalanced sets (inputs). - :param best_step_ranking_regression_trees: Use best regression - step trees? (inputs). + :param unbalanced_sets: Option for using derivatives optimized + for unbalanced sets (inputs). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (inputs). :param use_line_search: Should we use line search for a step size (inputs). - :param num_post_bracket_steps: Number of post-bracket line search - steps (inputs). - :param min_step_size: Minimum line search step size (inputs). + :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (inputs). + :param minimum_step_size: Minimum line search step size (inputs). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (inputs). :param early_stopping_rule: Early stopping rule. (Validation set @@ -133,8 +133,8 @@ def trainers_fasttreebinaryclassifier( times in the GetDerivatives function (inputs). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (inputs). 
- :param max_tree_output: Upper bound on absolute value of single - tree output (inputs). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (inputs). :param random_start: Training starts from random ordering (determined by /r1) (inputs). :param filter_zero_lambdas: Filter zero lambdas during training @@ -144,15 +144,14 @@ def trainers_fasttreebinaryclassifier( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (inputs). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (inputs). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (inputs). - :param num_threads: The number of threads to use (inputs). - :param rng_seed: The seed of the random number generator - (inputs). - :param feature_select_seed: The seed of the active feature + :param number_of_threads: The number of threads to use (inputs). + :param seed: The seed of the random number generator (inputs). + :param feature_selection_seed: The seed of the active feature selection (inputs). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (inputs). @@ -165,27 +164,28 @@ def trainers_fasttreebinaryclassifier( dataset preparation to speed up training (inputs). :param categorical_split: Whether to do split based on multiple categorical feature values. (inputs). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (inputs). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (inputs). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (inputs). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (inputs). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (inputs). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (inputs). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (inputs). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (inputs). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (inputs). 
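# Editor's sketch (assumed import path, illustrative values): the boosted-tree binary
# classifier follows the same renaming pattern; note also that its early_stopping_metrics
# default changed from 0 to 1 in this patch, so callers relying on the old default should now
# pass it explicitly.
from nimbusml.internal.entrypoints.trainers_fasttreebinaryclassifier import (
    trainers_fasttreebinaryclassifier,
)

node = trainers_fasttreebinaryclassifier(
    training_data='$training_data',             # assumed graph-variable name
    learning_rate=0.2,                          # was learning_rates
    number_of_trees=100,                        # was num_trees
    number_of_leaves=20,                        # was num_leaves
    maximum_tree_output=100.0,                  # was max_tree_output
    maximum_number_of_line_search_steps=0,      # was num_post_bracket_steps
    minimum_step_size=0.0,                      # was min_step_size
    early_stopping_metrics=0,                   # old default; the new default is 1
    predictor_model='$predictor_model',         # assumed output-variable name
)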
:param feature_first_use_penalty: The feature first use penalty @@ -196,16 +196,16 @@ def trainers_fasttreebinaryclassifier( requirement (should be in the range [0,1) ). (inputs). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (inputs). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (inputs). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (inputs). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (inputs). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (inputs). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (inputs). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (inputs). :param smoothing: Smoothing paramter for tree regularization (inputs). :param allow_empty_trees: When a root split is impossible, allow @@ -213,8 +213,6 @@ def trainers_fasttreebinaryclassifier( :param feature_compression_level: The level of feature compression to use (inputs). :param compress_ensemble: Compress the tree Ensemble (inputs). - :param max_trees_after_compression: Maximum Number of trees after - compression (inputs). :param print_test_graph: Print metrics graph for the first test set (inputs). :param print_train_valid_graph: Print Train and Validation @@ -228,9 +226,9 @@ def trainers_fasttreebinaryclassifier( inputs = {} outputs = {} - if num_trees is not None: - inputs['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + inputs['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -238,42 +236,42 @@ def trainers_fasttreebinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - inputs['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - inputs['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + inputs['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if 
group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -295,7 +293,6 @@ def trainers_fasttreebinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if unbalanced_sets is not None: inputs['UnbalancedSets'] = try_set( @@ -312,14 +309,14 @@ def trainers_fasttreebinaryclassifier( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - inputs['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + inputs['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - inputs['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not None: + inputs['MinimumStepSize'] = try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -381,9 +378,9 @@ def trainers_fasttreebinaryclassifier( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - inputs['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if maximum_tree_output is not None: + inputs['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -412,19 +409,19 @@ def trainers_fasttreebinaryclassifier( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - inputs['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + inputs['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -452,24 +449,24 @@ def trainers_fasttreebinaryclassifier( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - inputs['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + inputs['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - inputs['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - inputs['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + inputs['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, 
none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - inputs['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + inputs['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -486,9 +483,9 @@ def trainers_fasttreebinaryclassifier( 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -516,9 +513,9 @@ def trainers_fasttreebinaryclassifier( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - inputs['ExecutionTimes'] = try_set( - obj=execution_times, + if execution_time is not None: + inputs['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: @@ -531,14 +528,14 @@ def trainers_fasttreebinaryclassifier( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - inputs['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + inputs['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - inputs['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + inputs['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -561,11 +558,6 @@ def trainers_fasttreebinaryclassifier( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - inputs['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: inputs['PrintTestGraph'] = try_set( obj=print_test_graph, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fasttreeranker.py b/src/python/nimbusml/internal/entrypoints/trainers_fasttreeranker.py index 77b0499b..8af029e5 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fasttreeranker.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fasttreeranker.py @@ -12,7 +12,7 @@ def trainers_fasttreeranker( training_data, predictor_model=None, - num_trees=100, + number_of_trees=100, num_leaves=20, feature_column='Features', min_documents_in_leafs=10, @@ -91,7 +91,7 @@ def trainers_fasttreeranker( Trains gradient boosted decision trees to the LambdaRank quasi- gradient. - :param num_trees: Total number of decision trees to create in the + :param number_of_trees: Total number of decision trees to create in the ensemble (inputs). :param training_data: The data to be used for training (inputs). :param num_leaves: The max number of leaves in each regression @@ -123,8 +123,8 @@ def trainers_fasttreeranker( :param distance_weight2: Distance weight 2 adjustment to cost (inputs). :param normalize_query_lambdas: Normalize query lambdas (inputs). - :param best_step_ranking_regression_trees: Use best regression - step trees? (inputs). 
+ :param best_step_ranking_regression_trees: Option for using best + regression step trees (inputs). :param use_line_search: Should we use line search for a step size (inputs). :param num_post_bracket_steps: Number of post-bracket line search @@ -247,9 +247,9 @@ def trainers_fasttreeranker( inputs = {} outputs = {} - if num_trees is not None: + if number_of_trees is not None: inputs['NumTrees'] = try_set( - obj=num_trees, + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -314,7 +314,6 @@ def trainers_fasttreeranker( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if custom_gains is not None: inputs['CustomGains'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fasttreeregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_fasttreeregressor.py index 6408d30c..9466eae3 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fasttreeregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fasttreeregressor.py @@ -12,20 +12,20 @@ def trainers_fasttreeregressor( training_data, predictor_model=None, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', early_stopping_rule=None, early_stopping_metrics=1, @@ -37,43 +37,42 @@ def trainers_fasttreeregressor( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -83,31 +82,32 @@ def trainers_fasttreeregressor( Trains gradient boosted decision trees to fit target values using least-squares. 
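# Editor's note (sketch, not part of the patch): in trainers_fasttreeranker above only the
# Python keyword was renamed; the entrypoint manifest key is still 'NumTrees'. The guard
# pattern shared by all of these wrappers looks like the following; try_set here is a
# stand-in written purely for illustration, with behaviour assumed from its call sites.
import numbers

def try_set(obj, none_acceptable=True, is_of_type=None, **kwargs):
    # stand-in: reject a missing required value or an unexpected type, then pass through
    if obj is None:
        if not none_acceptable:
            raise ValueError("a value is required")
        return obj
    if is_of_type is not None and not isinstance(obj, is_of_type):
        raise TypeError("unexpected type for entrypoint input")
    return obj

inputs = {}
number_of_trees = 100                      # renamed Python-side argument
if number_of_trees is not None:
    inputs['NumTrees'] = try_set(          # manifest key left unchanged for the ranker
        obj=number_of_trees,
        none_acceptable=True,
        is_of_type=numbers.Real)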
- :param num_trees: Total number of decision trees to create in the - ensemble (inputs). + :param number_of_trees: Total number of decision trees to create + in the ensemble (inputs). :param training_data: The data to be used for training (inputs). - :param num_leaves: The max number of leaves in each regression - tree (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (inputs). - :param label_column: Column to use for labels (inputs). - :param learning_rates: The learning rate (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param number_of_leaves: The max number of leaves in each + regression tree (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (inputs). + :param label_column_name: Column to use for labels (inputs). + :param learning_rate: The learning rate (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param best_step_ranking_regression_trees: Use best regression - step trees? (inputs). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (inputs). :param use_line_search: Should we use line search for a step size (inputs). - :param num_post_bracket_steps: Number of post-bracket line search - steps (inputs). - :param min_step_size: Minimum line search step size (inputs). + :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (inputs). + :param minimum_step_size: Minimum line search step size (inputs). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (inputs). :param early_stopping_rule: Early stopping rule. (Validation set @@ -130,8 +130,8 @@ def trainers_fasttreeregressor( times in the GetDerivatives function (inputs). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (inputs). - :param max_tree_output: Upper bound on absolute value of single - tree output (inputs). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (inputs). :param random_start: Training starts from random ordering (determined by /r1) (inputs). :param filter_zero_lambdas: Filter zero lambdas during training @@ -141,15 +141,14 @@ def trainers_fasttreeregressor( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (inputs). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (inputs). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (inputs). - :param num_threads: The number of threads to use (inputs). - :param rng_seed: The seed of the random number generator - (inputs). 
- :param feature_select_seed: The seed of the active feature + :param number_of_threads: The number of threads to use (inputs). + :param seed: The seed of the random number generator (inputs). + :param feature_selection_seed: The seed of the active feature selection (inputs). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (inputs). @@ -162,27 +161,28 @@ def trainers_fasttreeregressor( dataset preparation to speed up training (inputs). :param categorical_split: Whether to do split based on multiple categorical feature values. (inputs). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (inputs). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (inputs). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (inputs). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (inputs). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (inputs). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (inputs). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (inputs). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (inputs). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (inputs). :param feature_first_use_penalty: The feature first use penalty @@ -193,16 +193,16 @@ def trainers_fasttreeregressor( requirement (should be in the range [0,1) ). (inputs). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (inputs). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (inputs). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (inputs). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (inputs). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (inputs). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (inputs). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (inputs). 
:param smoothing: Smoothing paramter for tree regularization (inputs). :param allow_empty_trees: When a root split is impossible, allow @@ -210,8 +210,6 @@ def trainers_fasttreeregressor( :param feature_compression_level: The level of feature compression to use (inputs). :param compress_ensemble: Compress the tree Ensemble (inputs). - :param max_trees_after_compression: Maximum Number of trees after - compression (inputs). :param print_test_graph: Print metrics graph for the first test set (inputs). :param print_train_valid_graph: Print Train and Validation @@ -225,9 +223,9 @@ def trainers_fasttreeregressor( inputs = {} outputs = {} - if num_trees is not None: - inputs['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + inputs['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -235,42 +233,42 @@ def trainers_fasttreeregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - inputs['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - inputs['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + inputs['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -292,7 +290,6 @@ def trainers_fasttreeregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if best_step_ranking_regression_trees is not None: inputs['BestStepRankingRegressionTrees'] = try_set( @@ -304,14 +301,14 @@ def trainers_fasttreeregressor( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - inputs['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + inputs['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - inputs['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not None: + inputs['MinimumStepSize'] = 
try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -373,9 +370,9 @@ def trainers_fasttreeregressor( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - inputs['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if maximum_tree_output is not None: + inputs['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -404,19 +401,19 @@ def trainers_fasttreeregressor( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - inputs['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + inputs['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -444,24 +441,24 @@ def trainers_fasttreeregressor( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - inputs['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + inputs['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - inputs['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - inputs['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + inputs['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - inputs['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + inputs['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -478,9 +475,9 @@ def trainers_fasttreeregressor( 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -508,9 +505,9 @@ def trainers_fasttreeregressor( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - inputs['ExecutionTimes'] = try_set( - obj=execution_times, + if execution_time is not 
None: + inputs['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: @@ -523,14 +520,14 @@ def trainers_fasttreeregressor( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - inputs['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + inputs['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - inputs['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + inputs['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -553,11 +550,6 @@ def trainers_fasttreeregressor( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - inputs['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: inputs['PrintTestGraph'] = try_set( obj=print_test_graph, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fasttreetweedieregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_fasttreetweedieregressor.py index f46aa6b8..d7a2807a 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fasttreetweedieregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fasttreetweedieregressor.py @@ -12,24 +12,24 @@ def trainers_fasttreetweedieregressor( training_data, predictor_model=None, - num_trees=100, - num_leaves=20, - feature_column='Features', - min_documents_in_leafs=10, - label_column='Label', - learning_rates=0.2, - weight_column=None, - group_id_column=None, + number_of_trees=100, + number_of_leaves=20, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.2, + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', index=1.5, best_step_ranking_regression_trees=False, use_line_search=False, - num_post_bracket_steps=0, - min_step_size=0.0, + maximum_number_of_line_search_steps=0, + minimum_step_size=0.0, optimization_algorithm='GradientDescent', early_stopping_rule=None, - early_stopping_metrics=0, + early_stopping_metrics=1, enable_pruning=False, use_tolerant_pruning=False, pruning_threshold=0.004, @@ -38,43 +38,42 @@ def trainers_fasttreetweedieregressor( dropout_rate=0.0, get_derivatives_sample_rate=1, write_last_ensemble=False, - max_tree_output=100.0, + maximum_tree_output=100.0, random_start=False, filter_zero_lambdas=False, baseline_scores_formula=None, baseline_alpha_risk=None, position_discount_freeform=None, parallel_trainer=None, - num_threads=None, - rng_seed=123, - feature_select_seed=123, + number_of_threads=None, + seed=123, + feature_selection_seed=123, entropy_coefficient=0.0, histogram_pool_size=-1, disk_transpose=None, feature_flocks=True, categorical_split=False, - max_categorical_groups_per_node=64, - max_categorical_split_points=64, - min_docs_percentage_for_categorical_split=0.001, - min_docs_for_categorical_split=100, + maximum_categorical_group_count_per_node=64, + maximum_categorical_split_point_count=64, + minimum_example_fraction_for_categorical_split=0.001, + minimum_examples_for_categorical_split=100, bias=0.0, bundling='None', - max_bins=255, + 
maximum_bin_count_per_feature=255, sparsify_threshold=0.7, feature_first_use_penalty=0.0, feature_reuse_penalty=0.0, gain_confidence_level=0.0, softmax_temperature=0.0, - execution_times=False, + execution_time=False, feature_fraction=1.0, bagging_size=0, - bagging_train_fraction=0.7, - split_fraction=1.0, + bagging_example_fraction=0.7, + feature_fraction_per_split=1.0, smoothing=0.0, allow_empty_trees=True, feature_compression_level=1, compress_ensemble=False, - max_trees_after_compression=-1, print_test_graph=False, print_train_valid_graph=False, test_frequency=2147483647, @@ -85,34 +84,35 @@ def trainers_fasttreetweedieregressor( Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression. - :param num_trees: Total number of decision trees to create in the - ensemble (inputs). + :param number_of_trees: Total number of decision trees to create + in the ensemble (inputs). :param training_data: The data to be used for training (inputs). - :param num_leaves: The max number of leaves in each regression - tree (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents_in_leafs: The minimal number of documents - allowed in a leaf of a regression tree, out of the subsampled - data (inputs). - :param label_column: Column to use for labels (inputs). - :param learning_rates: The learning rate (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param number_of_leaves: The max number of leaves in each + regression tree (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: The minimal number of + examples allowed in a leaf of a regression tree, out of the + subsampled data (inputs). + :param label_column_name: Column to use for labels (inputs). + :param learning_rate: The learning rate (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param index: Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. (inputs). - :param best_step_ranking_regression_trees: Use best regression - step trees? (inputs). + :param best_step_ranking_regression_trees: Option for using best + regression step trees (inputs). :param use_line_search: Should we use line search for a step size (inputs). - :param num_post_bracket_steps: Number of post-bracket line search - steps (inputs). - :param min_step_size: Minimum line search step size (inputs). + :param maximum_number_of_line_search_steps: Number of post- + bracket line search steps (inputs). + :param minimum_step_size: Minimum line search step size (inputs). :param optimization_algorithm: Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) (inputs). :param early_stopping_rule: Early stopping rule. (Validation set @@ -135,8 +135,8 @@ def trainers_fasttreetweedieregressor( times in the GetDerivatives function (inputs). :param write_last_ensemble: Write the last ensemble instead of the one determined by early stopping (inputs). 
- :param max_tree_output: Upper bound on absolute value of single - tree output (inputs). + :param maximum_tree_output: Upper bound on absolute value of + single tree output (inputs). :param random_start: Training starts from random ordering (determined by /r1) (inputs). :param filter_zero_lambdas: Filter zero lambdas during training @@ -146,15 +146,14 @@ def trainers_fasttreetweedieregressor( :param baseline_alpha_risk: Baseline alpha for tradeoffs of risk (0 is normal training) (inputs). :param position_discount_freeform: The discount freeform which - specifies the per position discounts of documents in a query + specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position) (inputs). :param parallel_trainer: Allows to choose Parallel FastTree Learning Algorithm (inputs). - :param num_threads: The number of threads to use (inputs). - :param rng_seed: The seed of the random number generator - (inputs). - :param feature_select_seed: The seed of the active feature + :param number_of_threads: The number of threads to use (inputs). + :param seed: The seed of the random number generator (inputs). + :param feature_selection_seed: The seed of the active feature selection (inputs). :param entropy_coefficient: The entropy (regularization) coefficient between 0 and 1 (inputs). @@ -167,27 +166,28 @@ def trainers_fasttreetweedieregressor( dataset preparation to speed up training (inputs). :param categorical_split: Whether to do split based on multiple categorical feature values. (inputs). - :param max_categorical_groups_per_node: Maximum categorical split - groups to consider when splitting on a categorical feature. - Split groups are a collection of split points. This is used - to reduce overfitting when there many categorical features. - (inputs). - :param max_categorical_split_points: Maximum categorical split - points to consider when splitting on a categorical feature. - (inputs). - :param min_docs_percentage_for_categorical_split: Minimum - categorical docs percentage in a bin to consider for a split. + :param maximum_categorical_group_count_per_node: Maximum + categorical split groups to consider when splitting on a + categorical feature. Split groups are a collection of split + points. This is used to reduce overfitting when there many + categorical features. (inputs). + :param maximum_categorical_split_point_count: Maximum categorical + split points to consider when splitting on a categorical + feature. (inputs). + :param minimum_example_fraction_for_categorical_split: Minimum + categorical example percentage in a bin to consider for a + split. (inputs). + :param minimum_examples_for_categorical_split: Minimum + categorical example count in a bin to consider for a split. (inputs). - :param min_docs_for_categorical_split: Minimum categorical doc - count in a bin to consider for a split. (inputs). :param bias: Bias for calculating gradient for each feature bin for a categorical feature. (inputs). :param bundling: Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). :param sparsify_threshold: Sparsity level needed to use sparse feature representation (inputs). 
:param feature_first_use_penalty: The feature first use penalty @@ -198,16 +198,16 @@ def trainers_fasttreetweedieregressor( requirement (should be in the range [0,1) ). (inputs). :param softmax_temperature: The temperature of the randomized softmax distribution for choosing the feature (inputs). - :param execution_times: Print execution time breakdown to stdout + :param execution_time: Print execution time breakdown to stdout (inputs). :param feature_fraction: The fraction of features (chosen randomly) to use on each iteration (inputs). :param bagging_size: Number of trees in each bag (0 for disabling bagging) (inputs). - :param bagging_train_fraction: Percentage of training examples + :param bagging_example_fraction: Percentage of training examples used in each bag (inputs). - :param split_fraction: The fraction of features (chosen randomly) - to use on each split (inputs). + :param feature_fraction_per_split: The fraction of features + (chosen randomly) to use on each split (inputs). :param smoothing: Smoothing paramter for tree regularization (inputs). :param allow_empty_trees: When a root split is impossible, allow @@ -215,8 +215,6 @@ def trainers_fasttreetweedieregressor( :param feature_compression_level: The level of feature compression to use (inputs). :param compress_ensemble: Compress the tree Ensemble (inputs). - :param max_trees_after_compression: Maximum Number of trees after - compression (inputs). :param print_test_graph: Print metrics graph for the first test set (inputs). :param print_train_valid_graph: Print Train and Validation @@ -230,9 +228,9 @@ def trainers_fasttreetweedieregressor( inputs = {} outputs = {} - if num_trees is not None: - inputs['NumTrees'] = try_set( - obj=num_trees, + if number_of_trees is not None: + inputs['NumberOfTrees'] = try_set( + obj=number_of_trees, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -240,42 +238,42 @@ def trainers_fasttreetweedieregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents_in_leafs is not None: - inputs['MinDocumentsInLeafs'] = try_set( - obj=min_documents_in_leafs, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - inputs['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + inputs['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if 
group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -297,7 +295,6 @@ def trainers_fasttreetweedieregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if index is not None: inputs['Index'] = try_set( @@ -314,14 +311,14 @@ def trainers_fasttreetweedieregressor( obj=use_line_search, none_acceptable=True, is_of_type=bool) - if num_post_bracket_steps is not None: - inputs['NumPostBracketSteps'] = try_set( - obj=num_post_bracket_steps, + if maximum_number_of_line_search_steps is not None: + inputs['MaximumNumberOfLineSearchSteps'] = try_set( + obj=maximum_number_of_line_search_steps, none_acceptable=True, is_of_type=numbers.Real) - if min_step_size is not None: - inputs['MinStepSize'] = try_set( - obj=min_step_size, + if minimum_step_size is not None: + inputs['MinimumStepSize'] = try_set( + obj=minimum_step_size, none_acceptable=True, is_of_type=numbers.Real) if optimization_algorithm is not None: @@ -383,9 +380,9 @@ def trainers_fasttreetweedieregressor( obj=write_last_ensemble, none_acceptable=True, is_of_type=bool) - if max_tree_output is not None: - inputs['MaxTreeOutput'] = try_set( - obj=max_tree_output, + if maximum_tree_output is not None: + inputs['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if random_start is not None: @@ -414,19 +411,19 @@ def trainers_fasttreetweedieregressor( obj=parallel_trainer, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) - if feature_select_seed is not None: - inputs['FeatureSelectSeed'] = try_set( - obj=feature_select_seed, + if feature_selection_seed is not None: + inputs['FeatureSelectionSeed'] = try_set( + obj=feature_selection_seed, none_acceptable=True, is_of_type=numbers.Real) if entropy_coefficient is not None: @@ -454,24 +451,24 @@ def trainers_fasttreetweedieregressor( obj=categorical_split, none_acceptable=True, is_of_type=bool) - if max_categorical_groups_per_node is not None: - inputs['MaxCategoricalGroupsPerNode'] = try_set( - obj=max_categorical_groups_per_node, + if maximum_categorical_group_count_per_node is not None: + inputs['MaximumCategoricalGroupCountPerNode'] = try_set( + obj=maximum_categorical_group_count_per_node, none_acceptable=True, is_of_type=numbers.Real) - if max_categorical_split_points is not None: - inputs['MaxCategoricalSplitPoints'] = try_set( - obj=max_categorical_split_points, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real) - if min_docs_percentage_for_categorical_split is not None: - inputs['MinDocsPercentageForCategoricalSplit'] = try_set( - obj=min_docs_percentage_for_categorical_split, + if minimum_example_fraction_for_categorical_split is not None: + inputs['MinimumExampleFractionForCategoricalSplit'] = try_set( + obj=minimum_example_fraction_for_categorical_split, none_acceptable=True, 
is_of_type=numbers.Real) - if min_docs_for_categorical_split is not None: - inputs['MinDocsForCategoricalSplit'] = try_set( - obj=min_docs_for_categorical_split, + if minimum_examples_for_categorical_split is not None: + inputs['MinimumExamplesForCategoricalSplit'] = try_set( + obj=minimum_examples_for_categorical_split, none_acceptable=True, is_of_type=numbers.Real) if bias is not None: @@ -488,9 +485,9 @@ def trainers_fasttreetweedieregressor( 'None', 'AggregateLowPopulation', 'Adjacent']) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) if sparsify_threshold is not None: @@ -518,9 +515,9 @@ def trainers_fasttreetweedieregressor( obj=softmax_temperature, none_acceptable=True, is_of_type=numbers.Real) - if execution_times is not None: - inputs['ExecutionTimes'] = try_set( - obj=execution_times, + if execution_time is not None: + inputs['ExecutionTime'] = try_set( + obj=execution_time, none_acceptable=True, is_of_type=bool) if feature_fraction is not None: @@ -533,14 +530,14 @@ def trainers_fasttreetweedieregressor( obj=bagging_size, none_acceptable=True, is_of_type=numbers.Real) - if bagging_train_fraction is not None: - inputs['BaggingTrainFraction'] = try_set( - obj=bagging_train_fraction, + if bagging_example_fraction is not None: + inputs['BaggingExampleFraction'] = try_set( + obj=bagging_example_fraction, none_acceptable=True, is_of_type=numbers.Real) - if split_fraction is not None: - inputs['SplitFraction'] = try_set( - obj=split_fraction, + if feature_fraction_per_split is not None: + inputs['FeatureFractionPerSplit'] = try_set( + obj=feature_fraction_per_split, none_acceptable=True, is_of_type=numbers.Real) if smoothing is not None: @@ -563,11 +560,6 @@ def trainers_fasttreetweedieregressor( obj=compress_ensemble, none_acceptable=True, is_of_type=bool) - if max_trees_after_compression is not None: - inputs['MaxTreesAfterCompression'] = try_set( - obj=max_trees_after_compression, - none_acceptable=True, - is_of_type=numbers.Real) if print_test_graph is not None: inputs['PrintTestGraph'] = try_set( obj=print_test_graph, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_fieldawarefactorizationmachinebinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_fieldawarefactorizationmachinebinaryclassifier.py index 95ff5dc3..59a2f627 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_fieldawarefactorizationmachinebinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_fieldawarefactorizationmachinebinaryclassifier.py @@ -13,15 +13,16 @@ def trainers_fieldawarefactorizationmachinebinaryclassifier( training_data, predictor_model=None, learning_rate=0.1, - iters=5, - feature_column='Features', - latent_dim=20, - label_column='Label', + number_of_iterations=5, + feature_column_name='Features', + latent_dimension=20, + label_column_name='Label', lambda_linear=0.0001, + example_weight_column_name=None, lambda_latent=0.0001, - normalize_features='Auto', - norm=True, + normalize_features=True, caching='Auto', + extra_feature_columns=None, shuffle=True, verbose=True, radius=0.5, @@ -32,20 +33,26 @@ def trainers_fieldawarefactorizationmachinebinaryclassifier( :param learning_rate: Initial learning rate (inputs). :param training_data: The data to be used for training (inputs). - :param iters: Number of training iterations (inputs). 
- :param feature_column: Column to use for features (inputs). - :param latent_dim: Latent space dimension (inputs). - :param label_column: Column to use for labels (inputs). + :param number_of_iterations: Number of training iterations + (inputs). + :param feature_column_name: Column to use for features (inputs). + :param latent_dimension: Latent space dimension (inputs). + :param label_column_name: Column to use for labels (inputs). :param lambda_linear: Regularization coefficient of linear weights (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param lambda_latent: Regularization coefficient of latent weights (inputs). - :param normalize_features: Normalize option for the feature - column (inputs). - :param norm: Whether to normalize the input vectors so that the - concatenation of all fields' feature vectors is unit-length + :param normalize_features: Whether to normalize the input vectors + so that the concatenation of all fields' feature vectors is + unit-length (inputs). + :param caching: Whether trainer should cache input training data (inputs). - :param caching: Whether learner should cache input training data + :param extra_feature_columns: Extra columns to use for feature + vectors. The i-th specified string denotes the column + containing features form the (i+1)-th field. Note that the + first field is specified by "feat" instead of "exfeat". (inputs). :param shuffle: Whether to shuffle for each training iteration (inputs). @@ -68,25 +75,25 @@ def trainers_fieldawarefactorizationmachinebinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if iters is not None: - inputs['Iters'] = try_set( - obj=iters, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if latent_dim is not None: - inputs['LatentDim'] = try_set( - obj=latent_dim, + if latent_dimension is not None: + inputs['LatentDimension'] = try_set( + obj=latent_dimension, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -95,6 +102,12 @@ def trainers_fieldawarefactorizationmachinebinaryclassifier( obj=lambda_linear, none_acceptable=True, is_of_type=numbers.Real) + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) if lambda_latent is not None: inputs['LambdaLatent'] = try_set( obj=lambda_latent, @@ -102,19 +115,7 @@ def trainers_fieldawarefactorizationmachinebinaryclassifier( is_of_type=numbers.Real) if normalize_features is not None: inputs['NormalizeFeatures'] = try_set( - obj=normalize_features, - none_acceptable=True, - is_of_type=str, - values=[ - 'No', - 'Warn', - 'Auto', - 'Yes']) - if norm is not None: - inputs['Norm'] = try_set( - obj=norm, - none_acceptable=True, - is_of_type=bool) + obj=normalize_features, none_acceptable=True, is_of_type=bool) if caching is not None: inputs['Caching'] = try_set( obj=caching, @@ -123,8 +124,13 @@ 
def trainers_fieldawarefactorizationmachinebinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) + if extra_feature_columns is not None: + inputs['ExtraFeatureColumns'] = try_set( + obj=extra_feature_columns, + none_acceptable=True, + is_of_type=list, + is_column=True) if shuffle is not None: inputs['Shuffle'] = try_set( obj=shuffle, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py index 468d1c05..e5b62a23 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py @@ -12,23 +12,23 @@ def trainers_generalizedadditivemodelbinaryclassifier( training_data, predictor_model=None, - num_iterations=9500, - feature_column='Features', - min_documents=10, - label_column='Label', - learning_rates=0.002, - weight_column=None, + number_of_iterations=9500, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.002, + example_weight_column_name=None, normalize_features='Auto', caching='Auto', unbalanced_sets=False, entropy_coefficient=0.0, gain_confidence_level=0, - num_threads=None, + number_of_threads=None, disk_transpose=None, - max_bins=255, - max_output=float("inf"), + maximum_bin_count_per_feature=255, + maximum_tree_output=float("inf"), get_derivatives_sample_rate=1, - rng_seed=123, + seed=123, feature_flocks=True, enable_pruning=True, **params): @@ -38,18 +38,19 @@ def trainers_generalizedadditivemodelbinaryclassifier( simultaneously, to fit target values using least-squares. It mantains no interactions between features. - :param num_iterations: Total number of iterations over all + :param number_of_iterations: Total number of iterations over all features (inputs). :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents: Minimum number of training instances - required to form a partition (inputs). - :param label_column: Column to use for labels (inputs). - :param learning_rates: The learning rate (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: Minimum number of training + instances required to form a partition (inputs). + :param label_column_name: Column to use for labels (inputs). + :param learning_rate: The learning rate (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param unbalanced_sets: Should we use derivatives optimized for unbalanced sets (inputs). @@ -57,18 +58,17 @@ def trainers_generalizedadditivemodelbinaryclassifier( coefficient between 0 and 1 (inputs). :param gain_confidence_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). (inputs). - :param num_threads: The number of threads to use (inputs). + :param number_of_threads: The number of threads to use (inputs). 
:param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). - :param max_output: Upper bound on absolute value of single output - (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). + :param maximum_tree_output: Upper bound on absolute value of + single output (inputs). :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function (inputs). - :param rng_seed: The seed of the random number generator - (inputs). + :param seed: The seed of the random number generator (inputs). :param feature_flocks: Whether to collectivize features during dataset preparation to speed up training (inputs). :param enable_pruning: Enable post-training pruning to avoid @@ -80,9 +80,9 @@ def trainers_generalizedadditivemodelbinaryclassifier( inputs = {} outputs = {} - if num_iterations is not None: - inputs['NumIterations'] = try_set( - obj=num_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -90,31 +90,31 @@ def trainers_generalizedadditivemodelbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents is not None: - inputs['MinDocuments'] = try_set( - obj=min_documents, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - inputs['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + inputs['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,7 +136,6 @@ def trainers_generalizedadditivemodelbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if unbalanced_sets is not None: inputs['UnbalancedSets'] = try_set( @@ -153,9 +152,9 @@ def trainers_generalizedadditivemodelbinaryclassifier( obj=gain_confidence_level, none_acceptable=True, is_of_type=numbers.Real) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if disk_transpose is not None: @@ -163,14 +162,14 @@ def trainers_generalizedadditivemodelbinaryclassifier( obj=disk_transpose, none_acceptable=True, is_of_type=bool) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not 
None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) - if max_output is not None: - inputs['MaxOutput'] = try_set( - obj=max_output, + if maximum_tree_output is not None: + inputs['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if get_derivatives_sample_rate is not None: @@ -178,9 +177,9 @@ def trainers_generalizedadditivemodelbinaryclassifier( obj=get_derivatives_sample_rate, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) if feature_flocks is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py index ab5512ee..1c56a706 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py @@ -12,23 +12,23 @@ def trainers_generalizedadditivemodelregressor( training_data, predictor_model=None, - num_iterations=9500, - feature_column='Features', - min_documents=10, - label_column='Label', - learning_rates=0.002, - weight_column=None, + number_of_iterations=9500, + feature_column_name='Features', + minimum_example_count_per_leaf=10, + label_column_name='Label', + learning_rate=0.002, + example_weight_column_name=None, normalize_features='Auto', caching='Auto', pruning_metrics=2, entropy_coefficient=0.0, gain_confidence_level=0, - num_threads=None, + number_of_threads=None, disk_transpose=None, - max_bins=255, - max_output=float("inf"), + maximum_bin_count_per_feature=255, + maximum_tree_output=float("inf"), get_derivatives_sample_rate=1, - rng_seed=123, + seed=123, feature_flocks=True, enable_pruning=True, **params): @@ -38,18 +38,19 @@ def trainers_generalizedadditivemodelregressor( simultaneously, to fit target values using least-squares. It mantains no interactions between features. - :param num_iterations: Total number of iterations over all + :param number_of_iterations: Total number of iterations over all features (inputs). :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param min_documents: Minimum number of training instances - required to form a partition (inputs). - :param label_column: Column to use for labels (inputs). - :param learning_rates: The learning rate (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param minimum_example_count_per_leaf: Minimum number of training + instances required to form a partition (inputs). + :param label_column_name: Column to use for labels (inputs). + :param learning_rate: The learning rate (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param pruning_metrics: Metric for pruning. (For regression, 1: L1, 2:L2; default L2) (inputs). 
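To make the rename concrete, here is a minimal usage sketch of the updated generalized additive model regressor entrypoint. The import path follows the file touched above; the '$training_data'/'$predictor_model' graph-variable names and the values shown (taken from the new defaults) are placeholders, not part of this change.

# Hypothetical call exercising the renamed GAM regressor arguments; the
# comments give the pre-rename argument names from the hunks above.
from nimbusml.internal.entrypoints.trainers_generalizedadditivemodelregressor import (
    trainers_generalizedadditivemodelregressor,
)

gam_node = trainers_generalizedadditivemodelregressor(
    training_data='$training_data',        # placeholder graph variable
    predictor_model='$predictor_model',    # placeholder graph variable
    number_of_iterations=9500,             # was num_iterations
    feature_column_name='Features',        # was feature_column
    minimum_example_count_per_leaf=10,     # was min_documents
    label_column_name='Label',             # was label_column
    learning_rate=0.002,                   # was learning_rates
    maximum_bin_count_per_feature=255,     # was max_bins
    maximum_tree_output=float("inf"),      # was max_output
    seed=123,                              # was rng_seed
)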
@@ -57,18 +58,17 @@ def trainers_generalizedadditivemodelregressor( coefficient between 0 and 1 (inputs). :param gain_confidence_level: Tree fitting gain confidence requirement (should be in the range [0,1) ). (inputs). - :param num_threads: The number of threads to use (inputs). + :param number_of_threads: The number of threads to use (inputs). :param disk_transpose: Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose (inputs). - :param max_bins: Maximum number of distinct values (bins) per - feature (inputs). - :param max_output: Upper bound on absolute value of single output - (inputs). + :param maximum_bin_count_per_feature: Maximum number of distinct + values (bins) per feature (inputs). + :param maximum_tree_output: Upper bound on absolute value of + single output (inputs). :param get_derivatives_sample_rate: Sample each query 1 in k times in the GetDerivatives function (inputs). - :param rng_seed: The seed of the random number generator - (inputs). + :param seed: The seed of the random number generator (inputs). :param feature_flocks: Whether to collectivize features during dataset preparation to speed up training (inputs). :param enable_pruning: Enable post-training pruning to avoid @@ -80,9 +80,9 @@ def trainers_generalizedadditivemodelregressor( inputs = {} outputs = {} - if num_iterations is not None: - inputs['NumIterations'] = try_set( - obj=num_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -90,31 +90,31 @@ def trainers_generalizedadditivemodelregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if min_documents is not None: - inputs['MinDocuments'] = try_set( - obj=min_documents, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if learning_rates is not None: - inputs['LearningRates'] = try_set( - obj=learning_rates, + if learning_rate is not None: + inputs['LearningRate'] = try_set( + obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,7 +136,6 @@ def trainers_generalizedadditivemodelregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if pruning_metrics is not None: inputs['PruningMetrics'] = try_set( @@ -153,9 +152,9 @@ def trainers_generalizedadditivemodelregressor( obj=gain_confidence_level, none_acceptable=True, is_of_type=numbers.Real) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, 
none_acceptable=True, is_of_type=numbers.Real) if disk_transpose is not None: @@ -163,14 +162,14 @@ def trainers_generalizedadditivemodelregressor( obj=disk_transpose, none_acceptable=True, is_of_type=bool) - if max_bins is not None: - inputs['MaxBins'] = try_set( - obj=max_bins, + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) - if max_output is not None: - inputs['MaxOutput'] = try_set( - obj=max_output, + if maximum_tree_output is not None: + inputs['MaximumTreeOutput'] = try_set( + obj=maximum_tree_output, none_acceptable=True, is_of_type=numbers.Real) if get_derivatives_sample_rate is not None: @@ -178,9 +177,9 @@ def trainers_generalizedadditivemodelregressor( obj=get_derivatives_sample_rate, none_acceptable=True, is_of_type=numbers.Real) - if rng_seed is not None: - inputs['RngSeed'] = try_set( - obj=rng_seed, + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, none_acceptable=True, is_of_type=numbers.Real) if feature_flocks is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_kmeansplusplusclusterer.py b/src/python/nimbusml/internal/entrypoints/trainers_kmeansplusplusclusterer.py index 417ebff4..b44dcd53 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_kmeansplusplusclusterer.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_kmeansplusplusclusterer.py @@ -12,15 +12,15 @@ def trainers_kmeansplusplusclusterer( training_data, predictor_model=None, - feature_column='Features', - weight_column=None, + feature_column_name='Features', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', k=5, - num_threads=None, - init_algorithm='KMeansParallel', + number_of_threads=None, + initialization_algorithm='KMeansYinyang', opt_tol=1e-07, - max_iterations=1000, + maximum_number_of_iterations=1000, accel_mem_budget_mb=4096, **params): """ @@ -32,19 +32,22 @@ def trainers_kmeansplusplusclusterer( the initial cluster centers. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param k: The number of clusters (inputs). - :param num_threads: Degree of lock-free parallelism. Defaults to - automatic. Determinism not guaranteed. (inputs). - :param init_algorithm: Cluster initialization algorithm (inputs). + :param number_of_threads: Degree of lock-free parallelism. + Defaults to automatic. Determinism not guaranteed. (inputs). + :param initialization_algorithm: Cluster initialization algorithm + (inputs). :param opt_tol: Tolerance parameter for trainer convergence. Low = slower, more accurate (inputs). - :param max_iterations: Maximum number of iterations. (inputs). + :param maximum_number_of_iterations: Maximum number of + iterations. (inputs). :param accel_mem_budget_mb: Memory budget (in MBs) to use for KMeans acceleration (inputs). :param predictor_model: The trained model (outputs). 
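Similarly, a minimal sketch of a call against the renamed k-means entrypoint; the keyword names follow the new signature in this hunk, while the data reference and graph-variable names are hypothetical.

# Hypothetical call exercising the renamed k-means keyword arguments;
# '$training_data' and '$predictor_model' are placeholder graph variables.
from nimbusml.internal.entrypoints.trainers_kmeansplusplusclusterer import (
    trainers_kmeansplusplusclusterer,
)

kmeans_node = trainers_kmeansplusplusclusterer(
    training_data='$training_data',
    predictor_model='$predictor_model',
    feature_column_name='Features',            # was feature_column
    example_weight_column_name=None,           # was weight_column
    k=5,
    number_of_threads=None,                    # was num_threads
    initialization_algorithm='KMeansYinyang',  # was init_algorithm='KMeansParallel'
    maximum_number_of_iterations=1000,         # was max_iterations
)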
@@ -59,15 +62,15 @@ def trainers_kmeansplusplusclusterer( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -89,35 +92,34 @@ def trainers_kmeansplusplusclusterer( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if k is not None: inputs['K'] = try_set( obj=k, none_acceptable=True, is_of_type=numbers.Real) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if init_algorithm is not None: - inputs['InitAlgorithm'] = try_set( - obj=init_algorithm, + if initialization_algorithm is not None: + inputs['InitializationAlgorithm'] = try_set( + obj=initialization_algorithm, none_acceptable=True, is_of_type=str, values=[ 'KMeansPlusPlus', 'Random', - 'KMeansParallel']) + 'KMeansYinyang']) if opt_tol is not None: inputs['OptTol'] = try_set( obj=opt_tol, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if accel_mem_budget_mb is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py index 91ea6061..5a54c69f 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py @@ -12,85 +12,91 @@ def trainers_lightgbmbinaryclassifier( training_data, predictor_model=None, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, - feature_column='Features', + number_of_leaves=None, + minimum_example_count_per_leaf=None, + feature_column_name='Features', booster=None, - label_column='Label', - weight_column=None, - group_id_column=None, + label_column_name='Label', + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + unbalanced_sets=False, + weight_of_positive_examples=1.0, + sigmoid=0.5, + evaluation_metric='Logloss', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + seed=None, parallel_trainer=None, **params): """ 
**Description** Train a LightGBM binary classification model. - :param num_boost_round: Number of iterations. (inputs). + :param number_of_iterations: Number of iterations. (inputs). :param training_data: The data to be used for training (inputs). :param learning_rate: Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1]. (inputs). - :param num_leaves: Maximum leaves for trees. (inputs). - :param min_data_per_leaf: Minimum number of instances needed in a - child. (inputs). - :param feature_column: Column to use for features (inputs). + :param number_of_leaves: Maximum leaves for trees. (inputs). + :param minimum_example_count_per_leaf: Minimum number of + instances needed in a child. (inputs). + :param feature_column_name: Column to use for features (inputs). :param booster: Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function. (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param max_bin: Max number of bucket bin for features. (inputs). - :param verbose_eval: Verbose (inputs). + :param unbalanced_sets: Use for binary classification when + training data is not balanced. (inputs). + :param weight_of_positive_examples: Control the balance of + positive and negative weights, useful for unbalanced classes. + A typical value to consider: sum(negative cases) / + sum(positive cases). (inputs). + :param sigmoid: Parameter for the sigmoid function. (inputs). + :param evaluation_metric: Evaluation metrics. (inputs). + :param maximum_bin_count_per_feature: Maximum number of bucket + bin for features. (inputs). + :param verbose: Verbose (inputs). :param silent: Printing running messages. (inputs). - :param n_thread: Number of parallel threads used to run LightGBM. - (inputs). - :param eval_metric: Evaluation metrics. (inputs). - :param use_softmax: Use softmax loss for the multi - classification. (inputs). + :param number_of_threads: Number of parallel threads used to run + LightGBM. (inputs). :param early_stopping_round: Rounds of early stopping, 0 will disable it. (inputs). - :param custom_gains: Comma seperated list of gains associated to - each relevance label. (inputs). - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. (inputs). :param batch_size: Number of entries in a batch when loading data. (inputs). - :param use_cat: Enable categorical split or not. (inputs). - :param use_missing: Enable missing value auto infer or not. + :param use_categorical_split: Enable categorical split or not. (inputs). - :param min_data_per_group: Min number of instances per - categorical group. (inputs). - :param max_cat_threshold: Max number of categorical thresholds. - (inputs). - :param cat_smooth: Lapalace smooth term in categorical feature - spilt. Avoid the bias of small categories. (inputs). 
- :param cat_l2: L2 Regularization for categorical split. (inputs). + :param handle_missing_value: Enable special handling of missing + value or not. (inputs). + :param minimum_example_count_per_group: Minimum number of + instances per categorical group. (inputs). + :param maximum_categorical_split_point_count: Max number of + categorical thresholds. (inputs). + :param categorical_smoothing: Lapalace smooth term in categorical + feature spilt. Avoid the bias of small categories. (inputs). + :param l2_categorical_regularization: L2 Regularization for + categorical split. (inputs). + :param seed: Sets the random seed for LightGBM to use. (inputs). :param parallel_trainer: Parallel LightGBM Learning Algorithm (inputs). :param predictor_model: The trained model (outputs). @@ -100,9 +106,9 @@ def trainers_lightgbmbinaryclassifier( inputs = {} outputs = {} - if num_boost_round is not None: - inputs['NumBoostRound'] = try_set( - obj=num_boost_round, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -115,19 +121,19 @@ def trainers_lightgbmbinaryclassifier( obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if min_data_per_leaf is not None: - inputs['MinDataPerLeaf'] = try_set( - obj=min_data_per_leaf, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,21 +142,21 @@ def trainers_lightgbmbinaryclassifier( obj=booster, none_acceptable=True, is_of_type=dict) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -172,105 +178,102 @@ def trainers_lightgbmbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if max_bin is not None: - inputs['MaxBin'] = try_set( - obj=max_bin, - none_acceptable=True, - is_of_type=numbers.Real) - if verbose_eval is not None: - inputs['VerboseEval'] = try_set( - obj=verbose_eval, + if unbalanced_sets is not None: + inputs['UnbalancedSets'] = try_set( + obj=unbalanced_sets, none_acceptable=True, is_of_type=bool) - if silent is not None: - inputs['Silent'] = try_set( - obj=silent, + if weight_of_positive_examples is not None: + inputs['WeightOfPositiveExamples'] = try_set( + 
obj=weight_of_positive_examples, none_acceptable=True, - is_of_type=bool) - if n_thread is not None: - inputs['NThread'] = try_set( - obj=n_thread, + is_of_type=numbers.Real) + if sigmoid is not None: + inputs['Sigmoid'] = try_set( + obj=sigmoid, none_acceptable=True, is_of_type=numbers.Real) - if eval_metric is not None: - inputs['EvalMetric'] = try_set( - obj=eval_metric, + if evaluation_metric is not None: + inputs['EvaluationMetric'] = try_set( + obj=evaluation_metric, none_acceptable=True, is_of_type=str, values=[ - 'DefaultMetric', - 'Rmse', - 'Mae', + 'None', + 'Default', 'Logloss', 'Error', - 'Merror', - 'Mlogloss', - 'Auc', - 'Ndcg', - 'Map']) - if use_softmax is not None: - inputs['UseSoftmax'] = try_set( - obj=use_softmax, + 'AreaUnderCurve']) + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, + none_acceptable=True, + is_of_type=numbers.Real) + if verbose is not None: + inputs['Verbose'] = try_set( + obj=verbose, + none_acceptable=True, + is_of_type=bool) + if silent is not None: + inputs['Silent'] = try_set( + obj=silent, none_acceptable=True, is_of_type=bool) + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, + none_acceptable=True, + is_of_type=numbers.Real) if early_stopping_round is not None: inputs['EarlyStoppingRound'] = try_set( obj=early_stopping_round, none_acceptable=True, is_of_type=numbers.Real) - if custom_gains is not None: - inputs['CustomGains'] = try_set( - obj=custom_gains, - none_acceptable=True, - is_of_type=str) - if sigmoid is not None: - inputs['Sigmoid'] = try_set( - obj=sigmoid, - none_acceptable=True, - is_of_type=numbers.Real) if batch_size is not None: inputs['BatchSize'] = try_set( obj=batch_size, none_acceptable=True, is_of_type=numbers.Real) - if use_cat is not None: - inputs['UseCat'] = try_set( - obj=use_cat, + if use_categorical_split is not None: + inputs['UseCategoricalSplit'] = try_set( + obj=use_categorical_split, none_acceptable=True, is_of_type=bool) + if handle_missing_value is not None: + inputs['HandleMissingValue'] = try_set( + obj=handle_missing_value, none_acceptable=True, is_of_type=bool) - if use_missing is not None: - inputs['UseMissing'] = try_set( - obj=use_missing, - none_acceptable=True, - is_of_type=bool) - if min_data_per_group is not None: - inputs['MinDataPerGroup'] = try_set( - obj=min_data_per_group, + if minimum_example_count_per_group is not None: + inputs['MinimumExampleCountPerGroup'] = try_set( + obj=minimum_example_count_per_group, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if max_cat_threshold is not None: - inputs['MaxCatThreshold'] = try_set( - obj=max_cat_threshold, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if cat_smooth is not None: - inputs['CatSmooth'] = try_set( - obj=cat_smooth, + if categorical_smoothing is not None: + inputs['CategoricalSmoothing'] = try_set( + obj=categorical_smoothing, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if cat_l2 is not None: - inputs['CatL2'] = try_set( - obj=cat_l2, + if l2_categorical_regularization is not None: + inputs['L2CategoricalRegularization'] = try_set( + obj=l2_categorical_regularization, none_acceptable=True, is_of_type=numbers.Real, 
valid_range={'Min': 0.0}) + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, + none_acceptable=True, + is_of_type=numbers.Real) if parallel_trainer is not None: inputs['ParallelTrainer'] = try_set( obj=parallel_trainer, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py index 968ff7e0..b1227046 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py @@ -12,85 +12,86 @@ def trainers_lightgbmclassifier( training_data, predictor_model=None, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, - feature_column='Features', + number_of_leaves=None, + minimum_example_count_per_leaf=None, + feature_column_name='Features', booster=None, - label_column='Label', - weight_column=None, - group_id_column=None, + label_column_name='Label', + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, - silent=True, - n_thread=None, - eval_metric='DefaultMetric', use_softmax=None, - early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', sigmoid=0.5, + evaluation_metric='Error', + maximum_bin_count_per_feature=255, + verbose=False, + silent=True, + number_of_threads=None, + early_stopping_round=0, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + seed=None, parallel_trainer=None, **params): """ **Description** Train a LightGBM multi class model. - :param num_boost_round: Number of iterations. (inputs). + :param number_of_iterations: Number of iterations. (inputs). :param training_data: The data to be used for training (inputs). :param learning_rate: Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1]. (inputs). - :param num_leaves: Maximum leaves for trees. (inputs). - :param min_data_per_leaf: Minimum number of instances needed in a - child. (inputs). - :param feature_column: Column to use for features (inputs). + :param number_of_leaves: Maximum leaves for trees. (inputs). + :param minimum_example_count_per_leaf: Minimum number of + instances needed in a child. (inputs). + :param feature_column_name: Column to use for features (inputs). :param booster: Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function. (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data - (inputs). - :param max_bin: Max number of bucket bin for features. (inputs). - :param verbose_eval: Verbose (inputs). - :param silent: Printing running messages. 
(inputs). - :param n_thread: Number of parallel threads used to run LightGBM. + :param caching: Whether trainer should cache input training data (inputs). - :param eval_metric: Evaluation metrics. (inputs). :param use_softmax: Use softmax loss for the multi classification. (inputs). + :param sigmoid: Parameter for the sigmoid function. (inputs). + :param evaluation_metric: Evaluation metrics. (inputs). + :param maximum_bin_count_per_feature: Maximum number of bucket + bin for features. (inputs). + :param verbose: Verbose (inputs). + :param silent: Printing running messages. (inputs). + :param number_of_threads: Number of parallel threads used to run + LightGBM. (inputs). :param early_stopping_round: Rounds of early stopping, 0 will disable it. (inputs). - :param custom_gains: Comma seperated list of gains associated to - each relevance label. (inputs). - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. (inputs). :param batch_size: Number of entries in a batch when loading data. (inputs). - :param use_cat: Enable categorical split or not. (inputs). - :param use_missing: Enable missing value auto infer or not. - (inputs). - :param min_data_per_group: Min number of instances per - categorical group. (inputs). - :param max_cat_threshold: Max number of categorical thresholds. + :param use_categorical_split: Enable categorical split or not. (inputs). - :param cat_smooth: Lapalace smooth term in categorical feature - spilt. Avoid the bias of small categories. (inputs). - :param cat_l2: L2 Regularization for categorical split. (inputs). + :param handle_missing_value: Enable special handling of missing + value or not. (inputs). + :param minimum_example_count_per_group: Minimum number of + instances per categorical group. (inputs). + :param maximum_categorical_split_point_count: Max number of + categorical thresholds. (inputs). + :param categorical_smoothing: Lapalace smooth term in categorical + feature spilt. Avoid the bias of small categories. (inputs). + :param l2_categorical_regularization: L2 Regularization for + categorical split. (inputs). + :param seed: Sets the random seed for LightGBM to use. (inputs). :param parallel_trainer: Parallel LightGBM Learning Algorithm (inputs). :param predictor_model: The trained model (outputs). 
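For orientation, here is a minimal sketch of how the regenerated multiclass builder would be called once this rename lands. The graph-variable strings ('$training_data', '$predictor_model') and the concrete values are illustrative assumptions; only the keyword names and the accepted enum values are taken from the hunks above.

# Hedged sketch: calling the regenerated LightGBM multiclass entry-point
# builder with the renamed arguments introduced by this patch.
from nimbusml.internal.entrypoints.trainers_lightgbmclassifier import trainers_lightgbmclassifier

node = trainers_lightgbmclassifier(
    training_data='$training_data',        # graph variable (assumed), must be a str
    predictor_model='$predictor_model',    # output graph variable (assumed)
    number_of_iterations=100,              # was num_boost_round
    number_of_leaves=31,                   # was num_leaves
    minimum_example_count_per_leaf=20,     # was min_data_per_leaf
    feature_column_name='Features',        # was feature_column
    label_column_name='Label',             # was label_column
    evaluation_metric='LogLoss',           # new value set: None/Default/Error/LogLoss
    seed=42)                               # new parameter in this version

The builder still validates each keyword through try_set, so unsupported values (for example the removed 'Disk' caching option) are rejected at graph-construction time rather than inside ML.NET.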
@@ -100,9 +101,9 @@ def trainers_lightgbmclassifier( inputs = {} outputs = {} - if num_boost_round is not None: - inputs['NumBoostRound'] = try_set( - obj=num_boost_round, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -115,19 +116,19 @@ def trainers_lightgbmclassifier( obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if min_data_per_leaf is not None: - inputs['MinDataPerLeaf'] = try_set( - obj=min_data_per_leaf, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,21 +137,21 @@ def trainers_lightgbmclassifier( obj=booster, none_acceptable=True, is_of_type=dict) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -172,16 +173,35 @@ def trainers_lightgbmclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if max_bin is not None: - inputs['MaxBin'] = try_set( - obj=max_bin, + if use_softmax is not None: + inputs['UseSoftmax'] = try_set( + obj=use_softmax, + none_acceptable=True, + is_of_type=bool) + if sigmoid is not None: + inputs['Sigmoid'] = try_set( + obj=sigmoid, + none_acceptable=True, + is_of_type=numbers.Real) + if evaluation_metric is not None: + inputs['EvaluationMetric'] = try_set( + obj=evaluation_metric, + none_acceptable=True, + is_of_type=str, + values=[ + 'None', + 'Default', + 'Error', + 'LogLoss']) + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) - if verbose_eval is not None: - inputs['VerboseEval'] = try_set( - obj=verbose_eval, + if verbose is not None: + inputs['Verbose'] = try_set( + obj=verbose, none_acceptable=True, is_of_type=bool) if silent is not None: @@ -189,88 +209,60 @@ def trainers_lightgbmclassifier( obj=silent, none_acceptable=True, is_of_type=bool) - if n_thread is not None: - inputs['NThread'] = try_set( - obj=n_thread, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if eval_metric is not None: - inputs['EvalMetric'] = try_set( - obj=eval_metric, - 
none_acceptable=True, - is_of_type=str, - values=[ - 'DefaultMetric', - 'Rmse', - 'Mae', - 'Logloss', - 'Error', - 'Merror', - 'Mlogloss', - 'Auc', - 'Ndcg', - 'Map']) - if use_softmax is not None: - inputs['UseSoftmax'] = try_set( - obj=use_softmax, - none_acceptable=True, - is_of_type=bool) if early_stopping_round is not None: inputs['EarlyStoppingRound'] = try_set( obj=early_stopping_round, none_acceptable=True, is_of_type=numbers.Real) - if custom_gains is not None: - inputs['CustomGains'] = try_set( - obj=custom_gains, - none_acceptable=True, - is_of_type=str) - if sigmoid is not None: - inputs['Sigmoid'] = try_set( - obj=sigmoid, - none_acceptable=True, - is_of_type=numbers.Real) if batch_size is not None: inputs['BatchSize'] = try_set( obj=batch_size, none_acceptable=True, is_of_type=numbers.Real) - if use_cat is not None: - inputs['UseCat'] = try_set( - obj=use_cat, - none_acceptable=True, - is_of_type=bool) - if use_missing is not None: - inputs['UseMissing'] = try_set( - obj=use_missing, + if use_categorical_split is not None: + inputs['UseCategoricalSplit'] = try_set( + obj=use_categorical_split, none_acceptable=True, is_of_type=bool) + if handle_missing_value is not None: + inputs['HandleMissingValue'] = try_set( + obj=handle_missing_value, none_acceptable=True, is_of_type=bool) - if min_data_per_group is not None: - inputs['MinDataPerGroup'] = try_set( - obj=min_data_per_group, + if minimum_example_count_per_group is not None: + inputs['MinimumExampleCountPerGroup'] = try_set( + obj=minimum_example_count_per_group, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if max_cat_threshold is not None: - inputs['MaxCatThreshold'] = try_set( - obj=max_cat_threshold, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if cat_smooth is not None: - inputs['CatSmooth'] = try_set( - obj=cat_smooth, + if categorical_smoothing is not None: + inputs['CategoricalSmoothing'] = try_set( + obj=categorical_smoothing, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if cat_l2 is not None: - inputs['CatL2'] = try_set( - obj=cat_l2, + if l2_categorical_regularization is not None: + inputs['L2CategoricalRegularization'] = try_set( + obj=l2_categorical_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, + none_acceptable=True, + is_of_type=numbers.Real) if parallel_trainer is not None: inputs['ParallelTrainer'] = try_set( obj=parallel_trainer, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py index 115423cf..5a3a44fd 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py @@ -12,85 +12,86 @@ def trainers_lightgbmranker( training_data, predictor_model=None, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, - feature_column='Features', + number_of_leaves=None, + minimum_example_count_per_leaf=None, + feature_column_name='Features', booster=None, - label_column='Label', - weight_column=None, - group_id_column=None, + label_column_name='Label', + example_weight_column_name=None, + 
row_group_column_name=None, normalize_features='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], + sigmoid=0.5, + evaluation_metric='NormalizedDiscountedCumulativeGain', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + seed=None, parallel_trainer=None, **params): """ **Description** Train a LightGBM ranking model. - :param num_boost_round: Number of iterations. (inputs). + :param number_of_iterations: Number of iterations. (inputs). :param training_data: The data to be used for training (inputs). :param learning_rate: Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1]. (inputs). - :param num_leaves: Maximum leaves for trees. (inputs). - :param min_data_per_leaf: Minimum number of instances needed in a - child. (inputs). - :param feature_column: Column to use for features (inputs). + :param number_of_leaves: Maximum leaves for trees. (inputs). + :param minimum_example_count_per_leaf: Minimum number of + instances needed in a child. (inputs). + :param feature_column_name: Column to use for features (inputs). :param booster: Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function. (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param max_bin: Max number of bucket bin for features. (inputs). - :param verbose_eval: Verbose (inputs). + :param custom_gains: An array of gains associated to each + relevance label. (inputs). + :param sigmoid: Parameter for the sigmoid function. (inputs). + :param evaluation_metric: Evaluation metrics. (inputs). + :param maximum_bin_count_per_feature: Maximum number of bucket + bin for features. (inputs). + :param verbose: Verbose (inputs). :param silent: Printing running messages. (inputs). - :param n_thread: Number of parallel threads used to run LightGBM. - (inputs). - :param eval_metric: Evaluation metrics. (inputs). - :param use_softmax: Use softmax loss for the multi - classification. (inputs). + :param number_of_threads: Number of parallel threads used to run + LightGBM. (inputs). :param early_stopping_round: Rounds of early stopping, 0 will disable it. (inputs). - :param custom_gains: Comma seperated list of gains associated to - each relevance label. (inputs). - :param sigmoid: Parameter for the sigmoid function. 
Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. (inputs). :param batch_size: Number of entries in a batch when loading data. (inputs). - :param use_cat: Enable categorical split or not. (inputs). - :param use_missing: Enable missing value auto infer or not. + :param use_categorical_split: Enable categorical split or not. (inputs). - :param min_data_per_group: Min number of instances per - categorical group. (inputs). - :param max_cat_threshold: Max number of categorical thresholds. - (inputs). - :param cat_smooth: Lapalace smooth term in categorical feature - spilt. Avoid the bias of small categories. (inputs). - :param cat_l2: L2 Regularization for categorical split. (inputs). + :param handle_missing_value: Enable special handling of missing + value or not. (inputs). + :param minimum_example_count_per_group: Minimum number of + instances per categorical group. (inputs). + :param maximum_categorical_split_point_count: Max number of + categorical thresholds. (inputs). + :param categorical_smoothing: Lapalace smooth term in categorical + feature spilt. Avoid the bias of small categories. (inputs). + :param l2_categorical_regularization: L2 Regularization for + categorical split. (inputs). + :param seed: Sets the random seed for LightGBM to use. (inputs). :param parallel_trainer: Parallel LightGBM Learning Algorithm (inputs). :param predictor_model: The trained model (outputs). @@ -100,9 +101,9 @@ def trainers_lightgbmranker( inputs = {} outputs = {} - if num_boost_round is not None: - inputs['NumBoostRound'] = try_set( - obj=num_boost_round, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -115,19 +116,19 @@ def trainers_lightgbmranker( obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if min_data_per_leaf is not None: - inputs['MinDataPerLeaf'] = try_set( - obj=min_data_per_leaf, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,21 +137,21 @@ def trainers_lightgbmranker( obj=booster, none_acceptable=True, is_of_type=dict) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ 
-172,16 +173,35 @@ def trainers_lightgbmranker( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if max_bin is not None: - inputs['MaxBin'] = try_set( - obj=max_bin, + if custom_gains is not None: + inputs['CustomGains'] = try_set( + obj=custom_gains, + none_acceptable=True, + is_of_type=list) + if sigmoid is not None: + inputs['Sigmoid'] = try_set( + obj=sigmoid, none_acceptable=True, is_of_type=numbers.Real) - if verbose_eval is not None: - inputs['VerboseEval'] = try_set( - obj=verbose_eval, + if evaluation_metric is not None: + inputs['EvaluationMetric'] = try_set( + obj=evaluation_metric, + none_acceptable=True, + is_of_type=str, + values=[ + 'None', + 'Default', + 'MeanAveragedPrecision', + 'NormalizedDiscountedCumulativeGain']) + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, + none_acceptable=True, + is_of_type=numbers.Real) + if verbose is not None: + inputs['Verbose'] = try_set( + obj=verbose, none_acceptable=True, is_of_type=bool) if silent is not None: @@ -189,88 +209,60 @@ def trainers_lightgbmranker( obj=silent, none_acceptable=True, is_of_type=bool) - if n_thread is not None: - inputs['NThread'] = try_set( - obj=n_thread, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if eval_metric is not None: - inputs['EvalMetric'] = try_set( - obj=eval_metric, - none_acceptable=True, - is_of_type=str, - values=[ - 'DefaultMetric', - 'Rmse', - 'Mae', - 'Logloss', - 'Error', - 'Merror', - 'Mlogloss', - 'Auc', - 'Ndcg', - 'Map']) - if use_softmax is not None: - inputs['UseSoftmax'] = try_set( - obj=use_softmax, - none_acceptable=True, - is_of_type=bool) if early_stopping_round is not None: inputs['EarlyStoppingRound'] = try_set( obj=early_stopping_round, none_acceptable=True, is_of_type=numbers.Real) - if custom_gains is not None: - inputs['CustomGains'] = try_set( - obj=custom_gains, - none_acceptable=True, - is_of_type=str) - if sigmoid is not None: - inputs['Sigmoid'] = try_set( - obj=sigmoid, - none_acceptable=True, - is_of_type=numbers.Real) if batch_size is not None: inputs['BatchSize'] = try_set( obj=batch_size, none_acceptable=True, is_of_type=numbers.Real) - if use_cat is not None: - inputs['UseCat'] = try_set( - obj=use_cat, + if use_categorical_split is not None: + inputs['UseCategoricalSplit'] = try_set( + obj=use_categorical_split, none_acceptable=True, is_of_type=bool) + if handle_missing_value is not None: + inputs['HandleMissingValue'] = try_set( + obj=handle_missing_value, none_acceptable=True, is_of_type=bool) - if use_missing is not None: - inputs['UseMissing'] = try_set( - obj=use_missing, - none_acceptable=True, - is_of_type=bool) - if min_data_per_group is not None: - inputs['MinDataPerGroup'] = try_set( - obj=min_data_per_group, + if minimum_example_count_per_group is not None: + inputs['MinimumExampleCountPerGroup'] = try_set( + obj=minimum_example_count_per_group, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if max_cat_threshold is not None: - inputs['MaxCatThreshold'] = try_set( - obj=max_cat_threshold, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if cat_smooth is not None: - inputs['CatSmooth'] = try_set( - obj=cat_smooth, + if 
categorical_smoothing is not None: + inputs['CategoricalSmoothing'] = try_set( + obj=categorical_smoothing, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if cat_l2 is not None: - inputs['CatL2'] = try_set( - obj=cat_l2, + if l2_categorical_regularization is not None: + inputs['L2CategoricalRegularization'] = try_set( + obj=l2_categorical_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, + none_acceptable=True, + is_of_type=numbers.Real) if parallel_trainer is not None: inputs['ParallelTrainer'] = try_set( obj=parallel_trainer, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py index 79d3c310..32260ebe 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py @@ -12,85 +12,81 @@ def trainers_lightgbmregressor( training_data, predictor_model=None, - num_boost_round=100, + number_of_iterations=100, learning_rate=None, - num_leaves=None, - min_data_per_leaf=None, - feature_column='Features', + number_of_leaves=None, + minimum_example_count_per_leaf=None, + feature_column_name='Features', booster=None, - label_column='Label', - weight_column=None, - group_id_column=None, + label_column_name='Label', + example_weight_column_name=None, + row_group_column_name=None, normalize_features='Auto', caching='Auto', - max_bin=255, - verbose_eval=False, + evaluation_metric='RootMeanSquaredError', + maximum_bin_count_per_feature=255, + verbose=False, silent=True, - n_thread=None, - eval_metric='DefaultMetric', - use_softmax=None, + number_of_threads=None, early_stopping_round=0, - custom_gains='0,3,7,15,31,63,127,255,511,1023,2047,4095', - sigmoid=0.5, batch_size=1048576, - use_cat=None, - use_missing=False, - min_data_per_group=100, - max_cat_threshold=32, - cat_smooth=10.0, - cat_l2=10.0, + use_categorical_split=None, + handle_missing_value=True, + minimum_example_count_per_group=100, + maximum_categorical_split_point_count=32, + categorical_smoothing=10.0, + l2_categorical_regularization=10.0, + seed=None, parallel_trainer=None, **params): """ **Description** LightGBM Regression - :param num_boost_round: Number of iterations. (inputs). + :param number_of_iterations: Number of iterations. (inputs). :param training_data: The data to be used for training (inputs). :param learning_rate: Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1]. (inputs). - :param num_leaves: Maximum leaves for trees. (inputs). - :param min_data_per_leaf: Minimum number of instances needed in a - child. (inputs). - :param feature_column: Column to use for features (inputs). + :param number_of_leaves: Maximum leaves for trees. (inputs). + :param minimum_example_count_per_leaf: Minimum number of + instances needed in a child. (inputs). + :param feature_column_name: Column to use for features (inputs). :param booster: Which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function. (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). - :param group_id_column: Column to use for example groupId + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). 
+ :param row_group_column_name: Column to use for example groupId (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param max_bin: Max number of bucket bin for features. (inputs). - :param verbose_eval: Verbose (inputs). + :param evaluation_metric: Evaluation metrics. (inputs). + :param maximum_bin_count_per_feature: Maximum number of bucket + bin for features. (inputs). + :param verbose: Verbose (inputs). :param silent: Printing running messages. (inputs). - :param n_thread: Number of parallel threads used to run LightGBM. - (inputs). - :param eval_metric: Evaluation metrics. (inputs). - :param use_softmax: Use softmax loss for the multi - classification. (inputs). + :param number_of_threads: Number of parallel threads used to run + LightGBM. (inputs). :param early_stopping_round: Rounds of early stopping, 0 will disable it. (inputs). - :param custom_gains: Comma seperated list of gains associated to - each relevance label. (inputs). - :param sigmoid: Parameter for the sigmoid function. Used only in - LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in - LightGbmRankingTrainer. (inputs). :param batch_size: Number of entries in a batch when loading data. (inputs). - :param use_cat: Enable categorical split or not. (inputs). - :param use_missing: Enable missing value auto infer or not. - (inputs). - :param min_data_per_group: Min number of instances per - categorical group. (inputs). - :param max_cat_threshold: Max number of categorical thresholds. + :param use_categorical_split: Enable categorical split or not. (inputs). - :param cat_smooth: Lapalace smooth term in categorical feature - spilt. Avoid the bias of small categories. (inputs). - :param cat_l2: L2 Regularization for categorical split. (inputs). + :param handle_missing_value: Enable special handling of missing + value or not. (inputs). + :param minimum_example_count_per_group: Minimum number of + instances per categorical group. (inputs). + :param maximum_categorical_split_point_count: Max number of + categorical thresholds. (inputs). + :param categorical_smoothing: Lapalace smooth term in categorical + feature spilt. Avoid the bias of small categories. (inputs). + :param l2_categorical_regularization: L2 Regularization for + categorical split. (inputs). + :param seed: Sets the random seed for LightGBM to use. (inputs). :param parallel_trainer: Parallel LightGBM Learning Algorithm (inputs). :param predictor_model: The trained model (outputs). 
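Likewise for the regressor, a minimal, illustrative call with the renamed arguments; variable names and values are assumptions, while the keyword names, enum values, and defaults come from the hunks above.

# Hedged sketch: regressor entry point with the renamed arguments.
from nimbusml.internal.entrypoints.trainers_lightgbmregressor import trainers_lightgbmregressor

node = trainers_lightgbmregressor(
    training_data='$training_data',              # graph variable (assumed)
    predictor_model='$predictor_model',          # output graph variable (assumed)
    number_of_iterations=200,                    # was num_boost_round
    learning_rate=0.1,
    maximum_bin_count_per_feature=255,           # was max_bin
    evaluation_metric='RootMeanSquaredError',    # new default; the old 'Rmse'/'DefaultMetric' names are gone
    caching='Memory',                            # 'Disk' is no longer an accepted value
    handle_missing_value=True,                   # was use_missing
    seed=123)                                    # new parameter in this version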
@@ -100,9 +96,9 @@ def trainers_lightgbmregressor( inputs = {} outputs = {} - if num_boost_round is not None: - inputs['NumBoostRound'] = try_set( - obj=num_boost_round, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -115,19 +111,19 @@ def trainers_lightgbmregressor( obj=learning_rate, none_acceptable=True, is_of_type=numbers.Real) - if num_leaves is not None: - inputs['NumLeaves'] = try_set( - obj=num_leaves, + if number_of_leaves is not None: + inputs['NumberOfLeaves'] = try_set( + obj=number_of_leaves, none_acceptable=True, is_of_type=numbers.Real) - if min_data_per_leaf is not None: - inputs['MinDataPerLeaf'] = try_set( - obj=min_data_per_leaf, + if minimum_example_count_per_leaf is not None: + inputs['MinimumExampleCountPerLeaf'] = try_set( + obj=minimum_example_count_per_leaf, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -136,21 +132,21 @@ def trainers_lightgbmregressor( obj=booster, none_acceptable=True, is_of_type=dict) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if group_id_column is not None: - inputs['GroupIdColumn'] = try_set( - obj=group_id_column, + if row_group_column_name is not None: + inputs['RowGroupColumnName'] = try_set( + obj=row_group_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -172,16 +168,26 @@ def trainers_lightgbmregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if max_bin is not None: - inputs['MaxBin'] = try_set( - obj=max_bin, + if evaluation_metric is not None: + inputs['EvaluationMetric'] = try_set( + obj=evaluation_metric, + none_acceptable=True, + is_of_type=str, + values=[ + 'None', + 'Default', + 'MeanAbsoluteError', + 'RootMeanSquaredError', + 'MeanSquaredError']) + if maximum_bin_count_per_feature is not None: + inputs['MaximumBinCountPerFeature'] = try_set( + obj=maximum_bin_count_per_feature, none_acceptable=True, is_of_type=numbers.Real) - if verbose_eval is not None: - inputs['VerboseEval'] = try_set( - obj=verbose_eval, + if verbose is not None: + inputs['Verbose'] = try_set( + obj=verbose, none_acceptable=True, is_of_type=bool) if silent is not None: @@ -189,88 +195,60 @@ def trainers_lightgbmregressor( obj=silent, none_acceptable=True, is_of_type=bool) - if n_thread is not None: - inputs['NThread'] = try_set( - obj=n_thread, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) - if eval_metric is not None: - inputs['EvalMetric'] = try_set( - obj=eval_metric, - none_acceptable=True, - is_of_type=str, - values=[ - 'DefaultMetric', - 'Rmse', - 'Mae', - 'Logloss', - 'Error', - 'Merror', - 'Mlogloss', - 'Auc', - 'Ndcg', - 'Map']) - if use_softmax is not None: - 
inputs['UseSoftmax'] = try_set( - obj=use_softmax, - none_acceptable=True, - is_of_type=bool) if early_stopping_round is not None: inputs['EarlyStoppingRound'] = try_set( obj=early_stopping_round, none_acceptable=True, is_of_type=numbers.Real) - if custom_gains is not None: - inputs['CustomGains'] = try_set( - obj=custom_gains, - none_acceptable=True, - is_of_type=str) - if sigmoid is not None: - inputs['Sigmoid'] = try_set( - obj=sigmoid, - none_acceptable=True, - is_of_type=numbers.Real) if batch_size is not None: inputs['BatchSize'] = try_set( obj=batch_size, none_acceptable=True, is_of_type=numbers.Real) - if use_cat is not None: - inputs['UseCat'] = try_set( - obj=use_cat, + if use_categorical_split is not None: + inputs['UseCategoricalSplit'] = try_set( + obj=use_categorical_split, none_acceptable=True, is_of_type=bool) + if handle_missing_value is not None: + inputs['HandleMissingValue'] = try_set( + obj=handle_missing_value, none_acceptable=True, is_of_type=bool) - if use_missing is not None: - inputs['UseMissing'] = try_set( - obj=use_missing, - none_acceptable=True, - is_of_type=bool) - if min_data_per_group is not None: - inputs['MinDataPerGroup'] = try_set( - obj=min_data_per_group, + if minimum_example_count_per_group is not None: + inputs['MinimumExampleCountPerGroup'] = try_set( + obj=minimum_example_count_per_group, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if max_cat_threshold is not None: - inputs['MaxCatThreshold'] = try_set( - obj=max_cat_threshold, + if maximum_categorical_split_point_count is not None: + inputs['MaximumCategoricalSplitPointCount'] = try_set( + obj=maximum_categorical_split_point_count, none_acceptable=True, is_of_type=numbers.Real, valid_range={ 'Inf': 0, 'Max': 2147483647}) - if cat_smooth is not None: - inputs['CatSmooth'] = try_set( - obj=cat_smooth, + if categorical_smoothing is not None: + inputs['CategoricalSmoothing'] = try_set( + obj=categorical_smoothing, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) - if cat_l2 is not None: - inputs['CatL2'] = try_set( - obj=cat_l2, + if l2_categorical_regularization is not None: + inputs['L2CategoricalRegularization'] = try_set( + obj=l2_categorical_regularization, none_acceptable=True, is_of_type=numbers.Real, valid_range={'Min': 0.0}) + if seed is not None: + inputs['Seed'] = try_set( + obj=seed, + none_acceptable=True, + is_of_type=numbers.Real) if parallel_trainer is not None: inputs['ParallelTrainer'] = try_set( obj=parallel_trainer, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_linearsvmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_linearsvmbinaryclassifier.py index 691f4ac6..c165f8e6 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_linearsvmbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_linearsvmbinaryclassifier.py @@ -12,20 +12,20 @@ def trainers_linearsvmbinaryclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', lambda_=0.001, perform_projection=False, - num_iterations=1, - init_wts_diameter=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, no_bias=False, calibrator=None, max_calibration_examples=1000000, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, batch_size=1, **params): """ @@ -33,17 +33,19 @@ def 
trainers_linearsvmbinaryclassifier( Train a linear SVM. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param lambda_: Regularizer constant (inputs). :param perform_projection: Perform projection to unit-ball? Typically used with batch size > 1. (inputs). - :param num_iterations: Number of iterations (inputs). - :param init_wts_diameter: Init weights diameter (inputs). + :param number_of_iterations: Number of iterations (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). :param no_bias: No bias (inputs). :param calibrator: The calibrator kind to apply to the predictor. Specify null for no calibration (inputs). @@ -53,8 +55,6 @@ def trainers_linearsvmbinaryclassifier( (inputs). :param shuffle: Whether to shuffle for each training iteration (inputs). - :param streaming_cache_size: Size of cache when trained in Scope - (inputs). :param batch_size: Batch size (inputs). :param predictor_model: The trained model (outputs). """ @@ -68,15 +68,21 @@ def trainers_linearsvmbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -98,7 +104,6 @@ def trainers_linearsvmbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if lambda_ is not None: inputs['Lambda'] = try_set( @@ -108,14 +113,14 @@ def trainers_linearsvmbinaryclassifier( if perform_projection is not None: inputs['PerformProjection'] = try_set( obj=perform_projection, none_acceptable=True, is_of_type=bool) - if num_iterations is not None: - inputs['NumIterations'] = try_set( - obj=num_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) if no_bias is not None: @@ -143,11 +148,6 @@ def trainers_linearsvmbinaryclassifier( obj=shuffle, none_acceptable=True, is_of_type=bool) - if streaming_cache_size is not None: - inputs['StreamingCacheSize'] = try_set( - obj=streaming_cache_size, - none_acceptable=True, - is_of_type=numbers.Real) if batch_size is not None: inputs['BatchSize'] = try_set( 
obj=batch_size, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionbinaryclassifier.py index ffef3791..5f89639b 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionbinaryclassifier.py @@ -12,23 +12,23 @@ def trainers_logisticregressionbinaryclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', - show_training_stats=False, - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_initialization_tolerance=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - num_threads=None, + number_of_threads=None, dense_optimizer=False, **params): """ @@ -40,32 +40,36 @@ def trainers_logisticregressionbinaryclassifier( logistical function. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param show_training_stats: Show statistics of training examples. + :param show_training_statistics: Show statistics of training + examples. (inputs). + :param l2_regularization: L2 regularization weight (inputs). + :param l1_regularization: L1 regularization weight (inputs). + :param optimization_tolerance: Tolerance parameter for + optimization convergence. Low = slower, more accurate (inputs). - :param l2_weight: L2 regularization weight (inputs). - :param l1_weight: L1 regularization weight (inputs). - :param opt_tol: Tolerance parameter for optimization convergence. - Low = slower, more accurate (inputs). - :param memory_size: Memory size for L-BFGS. Low=faster, less + :param history_size: Memory size for L-BFGS. Low=faster, less accurate (inputs). :param enforce_non_negativity: Enforce non-negative weights (inputs). - :param init_wts_diameter: Init weights diameter (inputs). - :param max_iterations: Maximum iterations. (inputs). - :param sgd_initialization_tolerance: Run SGD to initialize LR - weights, converging to this tolerance (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). + :param maximum_number_of_iterations: Maximum iterations. + (inputs). + :param stochastic_gradient_descent_initilaization_tolerance: Run + SGD to initialize LR weights, converging to this tolerance + (inputs). :param quiet: If set to true, produce no output during training. (inputs). 
:param use_threads: Whether or not to use threads. Default is true (inputs). - :param num_threads: Number of threads (inputs). + :param number_of_threads: Number of threads (inputs). :param dense_optimizer: Force densification of the internal optimization vectors (inputs). :param predictor_model: The trained model (outputs). @@ -80,21 +84,21 @@ def trainers_logisticregressionbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -116,49 +120,48 @@ def trainers_logisticregressionbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if show_training_stats is not None: - inputs['ShowTrainingStats'] = try_set( - obj=show_training_stats, + if show_training_statistics is not None: + inputs['ShowTrainingStatistics'] = try_set( + obj=show_training_statistics, none_acceptable=True, is_of_type=bool) - if l2_weight is not None: - inputs['L2Weight'] = try_set( - obj=l2_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if l1_weight is not None: - inputs['L1Weight'] = try_set( - obj=l1_weight, + if l1_regularization is not None: + inputs['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real) - if opt_tol is not None: - inputs['OptTol'] = try_set( - obj=opt_tol, + if optimization_tolerance is not None: + inputs['OptimizationTolerance'] = try_set( + obj=optimization_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if memory_size is not None: - inputs['MemorySize'] = try_set( - obj=memory_size, + if history_size is not None: + inputs['HistorySize'] = try_set( + obj=history_size, none_acceptable=True, is_of_type=numbers.Real) if enforce_non_negativity is not None: inputs['EnforceNonNegativity'] = try_set( obj=enforce_non_negativity, none_acceptable=True, is_of_type=bool) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if sgd_initialization_tolerance is not None: - inputs['SgdInitializationTolerance'] = try_set( - obj=sgd_initialization_tolerance, + if stochastic_gradient_descent_initilaization_tolerance is not None: + inputs['StochasticGradientDescentInitilaizationTolerance'] = try_set( + obj=stochastic_gradient_descent_initilaization_tolerance, none_acceptable=True, 
is_of_type=numbers.Real) if quiet is not None: @@ -171,9 +174,9 @@ def trainers_logisticregressionbinaryclassifier( obj=use_threads, none_acceptable=True, is_of_type=bool) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if dense_optimizer is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py index eca935f1..5db498b1 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py @@ -12,60 +12,63 @@ def trainers_logisticregressionclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', - show_training_stats=False, - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_initialization_tolerance=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - num_threads=None, + number_of_threads=None, dense_optimizer=False, **params): """ **Description** - Logistic Regression is a method in statistics used to predict the - probability of occurrence of an event and can be used as a - classification algorithm. The algorithm predicts the - probability of occurrence of an event by fitting data to a - logistical function. + Maximum entrypy classification is a method in statistics used to + predict the probabilities of parallel events. The model + predicts the probabilities of parallel events by fitting data + to a softmax function. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param show_training_stats: Show statistics of training examples. + :param show_training_statistics: Show statistics of training + examples. (inputs). + :param l2_regularization: L2 regularization weight (inputs). + :param l1_regularization: L1 regularization weight (inputs). + :param optimization_tolerance: Tolerance parameter for + optimization convergence. Low = slower, more accurate (inputs). - :param l2_weight: L2 regularization weight (inputs). - :param l1_weight: L1 regularization weight (inputs). - :param opt_tol: Tolerance parameter for optimization convergence. - Low = slower, more accurate (inputs). 
- :param memory_size: Memory size for L-BFGS. Low=faster, less + :param history_size: Memory size for L-BFGS. Low=faster, less accurate (inputs). :param enforce_non_negativity: Enforce non-negative weights (inputs). - :param init_wts_diameter: Init weights diameter (inputs). - :param max_iterations: Maximum iterations. (inputs). - :param sgd_initialization_tolerance: Run SGD to initialize LR - weights, converging to this tolerance (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). + :param maximum_number_of_iterations: Maximum iterations. + (inputs). + :param stochastic_gradient_descent_initilaization_tolerance: Run + SGD to initialize LR weights, converging to this tolerance + (inputs). :param quiet: If set to true, produce no output during training. (inputs). :param use_threads: Whether or not to use threads. Default is true (inputs). - :param num_threads: Number of threads (inputs). + :param number_of_threads: Number of threads (inputs). :param dense_optimizer: Force densification of the internal optimization vectors (inputs). :param predictor_model: The trained model (outputs). @@ -80,21 +83,21 @@ def trainers_logisticregressionclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -116,49 +119,48 @@ def trainers_logisticregressionclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if show_training_stats is not None: - inputs['ShowTrainingStats'] = try_set( - obj=show_training_stats, + if show_training_statistics is not None: + inputs['ShowTrainingStatistics'] = try_set( + obj=show_training_statistics, none_acceptable=True, is_of_type=bool) - if l2_weight is not None: - inputs['L2Weight'] = try_set( - obj=l2_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if l1_weight is not None: - inputs['L1Weight'] = try_set( - obj=l1_weight, + if l1_regularization is not None: + inputs['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real) - if opt_tol is not None: - inputs['OptTol'] = try_set( - obj=opt_tol, + if optimization_tolerance is not None: + inputs['OptimizationTolerance'] = try_set( + obj=optimization_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if memory_size is not None: - inputs['MemorySize'] = try_set( - obj=memory_size, + if history_size is not None: + inputs['HistorySize'] = try_set( + obj=history_size, none_acceptable=True, is_of_type=numbers.Real) if enforce_non_negativity is not None: inputs['EnforceNonNegativity'] = try_set( obj=enforce_non_negativity, none_acceptable=True, is_of_type=bool) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if 
initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if sgd_initialization_tolerance is not None: - inputs['SgdInitializationTolerance'] = try_set( - obj=sgd_initialization_tolerance, + if stochastic_gradient_descent_initilaization_tolerance is not None: + inputs['StochasticGradientDescentInitilaizationTolerance'] = try_set( + obj=stochastic_gradient_descent_initilaization_tolerance, none_acceptable=True, is_of_type=numbers.Real) if quiet is not None: @@ -171,9 +173,9 @@ def trainers_logisticregressionclassifier( obj=use_threads, none_acceptable=True, is_of_type=bool) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if dense_optimizer is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_naivebayesclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_naivebayesclassifier.py index 2407940f..548cc4aa 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_naivebayesclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_naivebayesclassifier.py @@ -11,21 +11,21 @@ def trainers_naivebayesclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', normalize_features='Auto', caching='Auto', **params): """ **Description** - Train a MultiClassNaiveBayesTrainer. + Train a MulticlassNaiveBayesTrainer. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param predictor_model: The trained model (outputs). 
""" @@ -39,15 +39,15 @@ def trainers_naivebayesclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -69,7 +69,6 @@ def trainers_naivebayesclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if predictor_model is not None: outputs['PredictorModel'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/trainers_onlinegradientdescentregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_onlinegradientdescentregressor.py index bd49918c..855fe965 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_onlinegradientdescentregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_onlinegradientdescentregressor.py @@ -12,51 +12,50 @@ def trainers_onlinegradientdescentregressor( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', normalize_features='Auto', caching='Auto', loss_function=None, learning_rate=0.1, decrease_learning_rate=True, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, **params): """ **Description** Train a Online gradient descent perceptron. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). :param learning_rate: Learning rate (inputs). :param decrease_learning_rate: Decrease learning rate (inputs). - :param l2_regularizer_weight: L2 Regularization Weight (inputs). - :param num_iterations: Number of iterations (inputs). - :param init_wts_diameter: Init weights diameter (inputs). + :param l2_regularization: L2 Regularization Weight (inputs). + :param number_of_iterations: Number of iterations (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average (inputs). - :param do_lazy_updates: Instead of updating averaged weights on - every example, only update when loss is nonzero (inputs). + :param lazy_update: Instead of updating averaged weights on every + example, only update when loss is nonzero (inputs). :param recency_gain: Extra weight given to more recent updates (inputs). 
- :param recency_gain_multi: Whether Recency Gain is multiplicative - (vs. additive) (inputs). + :param recency_gain_multiplicative: Whether Recency Gain is + multiplicative (vs. additive) (inputs). :param averaged: Do averaging? (inputs). :param averaged_tolerance: The inexactness tolerance for averaging (inputs). @@ -64,8 +63,6 @@ def trainers_onlinegradientdescentregressor( (inputs). :param shuffle: Whether to shuffle for each training iteration (inputs). - :param streaming_cache_size: Size of cache when trained in Scope - (inputs). :param predictor_model: The trained model (outputs). """ @@ -78,15 +75,15 @@ def trainers_onlinegradientdescentregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -108,7 +105,6 @@ def trainers_onlinegradientdescentregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( @@ -123,19 +119,19 @@ def trainers_onlinegradientdescentregressor( if decrease_learning_rate is not None: inputs['DecreaseLearningRate'] = try_set( obj=decrease_learning_rate, none_acceptable=True, is_of_type=bool) - if l2_regularizer_weight is not None: - inputs['L2RegularizerWeight'] = try_set( - obj=l2_regularizer_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if num_iterations is not None: - inputs['NumIterations'] = try_set( - obj=num_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) if reset_weights_after_x_examples is not None: @@ -143,9 +139,9 @@ def trainers_onlinegradientdescentregressor( obj=reset_weights_after_x_examples, none_acceptable=True, is_of_type=numbers.Real) - if do_lazy_updates is not None: - inputs['DoLazyUpdates'] = try_set( - obj=do_lazy_updates, + if lazy_update is not None: + inputs['LazyUpdate'] = try_set( + obj=lazy_update, none_acceptable=True, is_of_type=bool) if recency_gain is not None: @@ -153,9 +149,9 @@ def trainers_onlinegradientdescentregressor( obj=recency_gain, none_acceptable=True, is_of_type=numbers.Real) - if recency_gain_multi is not None: - inputs['RecencyGainMulti'] = try_set( - obj=recency_gain_multi, + if recency_gain_multiplicative is not None: + inputs['RecencyGainMultiplicative'] = try_set( + obj=recency_gain_multiplicative, none_acceptable=True, is_of_type=bool) if averaged is not None: @@ -178,11 +174,6 @@ def trainers_onlinegradientdescentregressor( obj=shuffle, none_acceptable=True, is_of_type=bool) - if streaming_cache_size is not None: - inputs['StreamingCacheSize'] = try_set( - obj=streaming_cache_size, - none_acceptable=True, - is_of_type=numbers.Real) if predictor_model is not 
None: outputs['PredictorModel'] = try_set( obj=predictor_model, none_acceptable=False, is_of_type=str) diff --git a/src/python/nimbusml/internal/entrypoints/trainers_ordinaryleastsquaresregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_ordinaryleastsquaresregressor.py index 69b67034..a342d1bc 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_ordinaryleastsquaresregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_ordinaryleastsquaresregressor.py @@ -12,29 +12,30 @@ def trainers_ordinaryleastsquaresregressor( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', - l2_weight=1e-06, - per_parameter_significance=True, + l2_regularization=1e-06, + calculate_statistics=True, **params): """ **Description** Train an OLS regression model. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param l2_weight: L2 regularization weight (inputs). - :param per_parameter_significance: Whether to calculate per - parameter significance statistics (inputs). + :param l2_regularization: L2 regularization weight (inputs). + :param calculate_statistics: Whether to calculate per parameter + significance statistics (inputs). :param predictor_model: The trained model (outputs). 
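Not part of the diff itself, but to make the renames in the OLS hunks above concrete, here is a minimal usage sketch of the regenerated entrypoint under its new keyword names (the module path follows the file path above; the '$...' graph-variable strings are placeholders, not values from this change):

    from nimbusml.internal.entrypoints.trainers_ordinaryleastsquaresregressor import (
        trainers_ordinaryleastsquaresregressor)

    # Keyword arguments follow the renamed signature shown above:
    # feature_column_name / label_column_name / example_weight_column_name,
    # l2_regularization and calculate_statistics replace the old names.
    node = trainers_ordinaryleastsquaresregressor(
        training_data='$training_data',   # placeholder graph variable
        predictor_model='$model',         # placeholder graph variable (output)
        feature_column_name='Features',
        label_column_name='Label',
        l2_regularization=1e-06,
        calculate_statistics=True)

The returned entry point object (its construction is outside the hunks shown here) carries the new ML.NET input names such as 'L2Regularization' and 'CalculateStatistics'.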
""" @@ -47,21 +48,21 @@ def trainers_ordinaryleastsquaresregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -83,18 +84,15 @@ def trainers_ordinaryleastsquaresregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if l2_weight is not None: - inputs['L2Weight'] = try_set( - obj=l2_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if per_parameter_significance is not None: - inputs['PerParameterSignificance'] = try_set( - obj=per_parameter_significance, - none_acceptable=True, - is_of_type=bool) + if calculate_statistics is not None: + inputs['CalculateStatistics'] = try_set( + obj=calculate_statistics, none_acceptable=True, is_of_type=bool) if predictor_model is not None: outputs['PredictorModel'] = try_set( obj=predictor_model, none_acceptable=False, is_of_type=str) diff --git a/src/python/nimbusml/internal/entrypoints/trainers_pcaanomalydetector.py b/src/python/nimbusml/internal/entrypoints/trainers_pcaanomalydetector.py index 490d006d..8329c023 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_pcaanomalydetector.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_pcaanomalydetector.py @@ -12,8 +12,8 @@ def trainers_pcaanomalydetector( training_data, predictor_model=None, - feature_column='Features', - weight_column=None, + feature_column_name='Features', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', rank=20, @@ -26,11 +26,12 @@ def trainers_pcaanomalydetector( Train an PCA Anomaly model. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param rank: The number of components in the PCA (inputs). 
:param oversampling: Oversampling parameter for randomized PCA @@ -50,15 +51,15 @@ def trainers_pcaanomalydetector( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -80,7 +81,6 @@ def trainers_pcaanomalydetector( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if rank is not None: inputs['Rank'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/trainers_poissonregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_poissonregressor.py index 12a95a0e..8b11aaa2 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_poissonregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_poissonregressor.py @@ -12,22 +12,22 @@ def trainers_poissonregressor( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_initialization_tolerance=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - num_threads=None, + number_of_threads=None, dense_optimizer=False, **params): """ @@ -35,30 +35,34 @@ def trainers_poissonregressor( Train an Poisson regression model. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). - :param l2_weight: L2 regularization weight (inputs). - :param l1_weight: L1 regularization weight (inputs). - :param opt_tol: Tolerance parameter for optimization convergence. - Low = slower, more accurate (inputs). - :param memory_size: Memory size for L-BFGS. Low=faster, less + :param l2_regularization: L2 regularization weight (inputs). + :param l1_regularization: L1 regularization weight (inputs). + :param optimization_tolerance: Tolerance parameter for + optimization convergence. Low = slower, more accurate + (inputs). + :param history_size: Memory size for L-BFGS. Low=faster, less accurate (inputs). :param enforce_non_negativity: Enforce non-negative weights (inputs). - :param init_wts_diameter: Init weights diameter (inputs). 
- :param max_iterations: Maximum iterations. (inputs). - :param sgd_initialization_tolerance: Run SGD to initialize LR - weights, converging to this tolerance (inputs). + :param initial_weights_diameter: Init weights diameter (inputs). + :param maximum_number_of_iterations: Maximum iterations. + (inputs). + :param stochastic_gradient_descent_initilaization_tolerance: Run + SGD to initialize LR weights, converging to this tolerance + (inputs). :param quiet: If set to true, produce no output during training. (inputs). :param use_threads: Whether or not to use threads. Default is true (inputs). - :param num_threads: Number of threads (inputs). + :param number_of_threads: Number of threads (inputs). :param dense_optimizer: Force densification of the internal optimization vectors (inputs). :param predictor_model: The trained model (outputs). @@ -73,21 +77,21 @@ def trainers_poissonregressor( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -109,44 +113,43 @@ def trainers_poissonregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) - if l2_weight is not None: - inputs['L2Weight'] = try_set( - obj=l2_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if l1_weight is not None: - inputs['L1Weight'] = try_set( - obj=l1_weight, + if l1_regularization is not None: + inputs['L1Regularization'] = try_set( + obj=l1_regularization, none_acceptable=True, is_of_type=numbers.Real) - if opt_tol is not None: - inputs['OptTol'] = try_set( - obj=opt_tol, + if optimization_tolerance is not None: + inputs['OptimizationTolerance'] = try_set( + obj=optimization_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if memory_size is not None: - inputs['MemorySize'] = try_set( - obj=memory_size, + if history_size is not None: + inputs['HistorySize'] = try_set( + obj=history_size, none_acceptable=True, is_of_type=numbers.Real) if enforce_non_negativity is not None: inputs['EnforceNonNegativity'] = try_set( obj=enforce_non_negativity, none_acceptable=True, is_of_type=bool) - if init_wts_diameter is not None: - inputs['InitWtsDiameter'] = try_set( - obj=init_wts_diameter, + if initial_weights_diameter is not None: + inputs['InitialWeightsDiameter'] = try_set( + obj=initial_weights_diameter, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if sgd_initialization_tolerance is not None: - inputs['SgdInitializationTolerance'] = try_set( - obj=sgd_initialization_tolerance, + if 
stochastic_gradient_descent_initilaization_tolerance is not None: + inputs['StochasticGradientDescentInitilaizationTolerance'] = try_set( + obj=stochastic_gradient_descent_initilaization_tolerance, none_acceptable=True, is_of_type=numbers.Real) if quiet is not None: @@ -159,9 +162,9 @@ def trainers_poissonregressor( obj=use_threads, none_acceptable=True, is_of_type=bool) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if dense_optimizer is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentbinaryclassifier.py index a72847ef..b5317cb1 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentbinaryclassifier.py @@ -12,29 +12,30 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( training_data, predictor_model=None, - l2_const=None, + l2_regularization=None, l1_threshold=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', loss_function=None, - num_threads=None, - positive_instance_weight=1.0, + number_of_threads=None, calibrator=None, max_calibration_examples=1000000, + positive_instance_weight=1.0, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, **params): """ **Description** Train an SDCA binary model. - :param l2_const: L2 regularizer constant. By default the l2 - constant is automatically inferred based on data set. + :param l2_regularization: L2 regularizer constant. By default the + l2 constant is automatically inferred based on data set. (inputs). :param training_data: The data to be used for training (inputs). :param l1_threshold: L1 soft threshold (L1/L2). Note that it is @@ -42,31 +43,34 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). - :param num_threads: Degree of lock-free parallelism. Defaults to - automatic. Determinism not guaranteed. (inputs). - :param positive_instance_weight: Apply weight to the positive - class, for imbalanced data (inputs). + :param number_of_threads: Degree of lock-free parallelism. + Defaults to automatic. Determinism not guaranteed. (inputs). :param calibrator: The calibrator kind to apply to the predictor. Specify null for no calibration (inputs). 
:param max_calibration_examples: The maximum number of examples to use when training the calibrator (inputs). + :param positive_instance_weight: Apply weight to the positive + class, for imbalanced data (inputs). :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. (inputs). - :param max_iterations: Maximum number of iterations; set to 1 to - simulate online learning. Defaults to automatic. (inputs). + :param maximum_number_of_iterations: Maximum number of + iterations; set to 1 to simulate online learning. Defaults to + automatic. (inputs). :param shuffle: Shuffle data every epoch? (inputs). - :param check_frequency: Convergence check frequency (in terms of - number of iterations). Set as negative or zero for not - checking at all. If left blank, it defaults to check after - every 'numThreads' iterations. (inputs). + :param convergence_check_frequency: Convergence check frequency + (in terms of number of iterations). Set as negative or zero + for not checking at all. If left blank, it defaults to check + after every 'numThreads' iterations. (inputs). :param bias_learning_rate: The learning rate for adjusting bias from being regularized. (inputs). :param predictor_model: The trained model (outputs). @@ -76,9 +80,9 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( inputs = {} outputs = {} - if l2_const is not None: - inputs['L2Const'] = try_set( - obj=l2_const, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -91,15 +95,21 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( obj=l1_threshold, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -121,21 +131,15 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( obj=loss_function, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, - none_acceptable=True, - is_of_type=numbers.Real) - if positive_instance_weight is not None: - inputs['PositiveInstanceWeight'] = try_set( - obj=positive_instance_weight, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if calibrator is not None: @@ -148,14 +152,19 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( obj=max_calibration_examples, none_acceptable=True, is_of_type=numbers.Real) + if positive_instance_weight is not None: + inputs['PositiveInstanceWeight'] = try_set( + obj=positive_instance_weight, + none_acceptable=True, + is_of_type=numbers.Real) if convergence_tolerance is not None: inputs['ConvergenceTolerance'] = 
try_set( obj=convergence_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if shuffle is not None: @@ -163,9 +172,9 @@ def trainers_stochasticdualcoordinateascentbinaryclassifier( obj=shuffle, none_acceptable=True, is_of_type=bool) - if check_frequency is not None: - inputs['CheckFrequency'] = try_set( - obj=check_frequency, + if convergence_check_frequency is not None: + inputs['ConvergenceCheckFrequency'] = try_set( + obj=convergence_check_frequency, none_acceptable=True, is_of_type=numbers.Real) if bias_learning_rate is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentclassifier.py index dad5759d..6cf8b75b 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentclassifier.py @@ -12,26 +12,27 @@ def trainers_stochasticdualcoordinateascentclassifier( training_data, predictor_model=None, - l2_const=None, + l2_regularization=None, l1_threshold=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', loss_function=None, - num_threads=None, + number_of_threads=None, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, **params): """ **Description** The SDCA linear multi-class classification trainer. - :param l2_const: L2 regularizer constant. By default the l2 - constant is automatically inferred based on data set. + :param l2_regularization: L2 regularizer constant. By default the + l2 constant is automatically inferred based on data set. (inputs). :param training_data: The data to be used for training (inputs). :param l1_threshold: L1 soft threshold (L1/L2). Note that it is @@ -39,25 +40,28 @@ def trainers_stochasticdualcoordinateascentclassifier( than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). - :param num_threads: Degree of lock-free parallelism. Defaults to - automatic. Determinism not guaranteed. (inputs). + :param number_of_threads: Degree of lock-free parallelism. + Defaults to automatic. Determinism not guaranteed. (inputs). :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. (inputs). 
- :param max_iterations: Maximum number of iterations; set to 1 to - simulate online learning. Defaults to automatic. (inputs). + :param maximum_number_of_iterations: Maximum number of + iterations; set to 1 to simulate online learning. Defaults to + automatic. (inputs). :param shuffle: Shuffle data every epoch? (inputs). - :param check_frequency: Convergence check frequency (in terms of - number of iterations). Set as negative or zero for not - checking at all. If left blank, it defaults to check after - every 'numThreads' iterations. (inputs). + :param convergence_check_frequency: Convergence check frequency + (in terms of number of iterations). Set as negative or zero + for not checking at all. If left blank, it defaults to check + after every 'numThreads' iterations. (inputs). :param bias_learning_rate: The learning rate for adjusting bias from being regularized. (inputs). :param predictor_model: The trained model (outputs). @@ -67,9 +71,9 @@ def trainers_stochasticdualcoordinateascentclassifier( inputs = {} outputs = {} - if l2_const is not None: - inputs['L2Const'] = try_set( - obj=l2_const, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -82,15 +86,21 @@ def trainers_stochasticdualcoordinateascentclassifier( obj=l1_threshold, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -112,16 +122,15 @@ def trainers_stochasticdualcoordinateascentclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( obj=loss_function, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if convergence_tolerance is not None: @@ -129,9 +138,9 @@ def trainers_stochasticdualcoordinateascentclassifier( obj=convergence_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if shuffle is not None: @@ -139,9 +148,9 @@ def trainers_stochasticdualcoordinateascentclassifier( obj=shuffle, none_acceptable=True, is_of_type=bool) - if check_frequency is not None: - inputs['CheckFrequency'] = try_set( - obj=check_frequency, + if convergence_check_frequency is not None: + inputs['ConvergenceCheckFrequency'] = try_set( + obj=convergence_check_frequency, none_acceptable=True, is_of_type=numbers.Real) if bias_learning_rate is not None: diff --git 
a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentregressor.py index 2f3487a2..45589a41 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_stochasticdualcoordinateascentregressor.py @@ -12,26 +12,27 @@ def trainers_stochasticdualcoordinateascentregressor( training_data, predictor_model=None, - l2_const=None, + l2_regularization=None, l1_threshold=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', loss_function=None, - num_threads=None, + number_of_threads=None, convergence_tolerance=0.01, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=1.0, **params): """ **Description** The SDCA linear regression trainer. - :param l2_const: L2 regularizer constant. By default the l2 - constant is automatically inferred based on data set. + :param l2_regularization: L2 regularizer constant. By default the + l2 constant is automatically inferred based on data set. (inputs). :param training_data: The data to be used for training (inputs). :param l1_threshold: L1 soft threshold (L1/L2). Note that it is @@ -39,25 +40,28 @@ def trainers_stochasticdualcoordinateascentregressor( than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). - :param num_threads: Degree of lock-free parallelism. Defaults to - automatic. Determinism not guaranteed. (inputs). + :param number_of_threads: Degree of lock-free parallelism. + Defaults to automatic. Determinism not guaranteed. (inputs). :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. (inputs). - :param max_iterations: Maximum number of iterations; set to 1 to - simulate online learning. Defaults to automatic. (inputs). + :param maximum_number_of_iterations: Maximum number of + iterations; set to 1 to simulate online learning. Defaults to + automatic. (inputs). :param shuffle: Shuffle data every epoch? (inputs). - :param check_frequency: Convergence check frequency (in terms of - number of iterations). Set as negative or zero for not - checking at all. If left blank, it defaults to check after - every 'numThreads' iterations. (inputs). + :param convergence_check_frequency: Convergence check frequency + (in terms of number of iterations). Set as negative or zero + for not checking at all. If left blank, it defaults to check + after every 'numThreads' iterations. (inputs). :param bias_learning_rate: The learning rate for adjusting bias from being regularized. (inputs). 
:param predictor_model: The trained model (outputs). @@ -67,9 +71,9 @@ def trainers_stochasticdualcoordinateascentregressor( inputs = {} outputs = {} - if l2_const is not None: - inputs['L2Const'] = try_set( - obj=l2_const, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) if training_data is not None: @@ -82,15 +86,21 @@ def trainers_stochasticdualcoordinateascentregressor( obj=l1_threshold, none_acceptable=True, is_of_type=numbers.Real) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -112,16 +122,15 @@ def trainers_stochasticdualcoordinateascentregressor( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( obj=loss_function, none_acceptable=True, is_of_type=dict) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, none_acceptable=True, is_of_type=numbers.Real) if convergence_tolerance is not None: @@ -129,9 +138,9 @@ def trainers_stochasticdualcoordinateascentregressor( obj=convergence_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if maximum_number_of_iterations is not None: + inputs['MaximumNumberOfIterations'] = try_set( + obj=maximum_number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) if shuffle is not None: @@ -139,9 +148,9 @@ def trainers_stochasticdualcoordinateascentregressor( obj=shuffle, none_acceptable=True, is_of_type=bool) - if check_frequency is not None: - inputs['CheckFrequency'] = try_set( - obj=check_frequency, + if convergence_check_frequency is not None: + inputs['ConvergenceCheckFrequency'] = try_set( + obj=convergence_check_frequency, none_acceptable=True, is_of_type=numbers.Real) if bias_learning_rate is not None: diff --git a/src/python/nimbusml/internal/entrypoints/trainers_stochasticgradientdescentbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_stochasticgradientdescentbinaryclassifier.py index 59064c2d..68800069 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_stochasticgradientdescentbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_stochasticgradientdescentbinaryclassifier.py @@ -12,45 +12,50 @@ def trainers_stochasticgradientdescentbinaryclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', - weight_column=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, normalize_features='Auto', caching='Auto', loss_function=None, - l2_weight=1e-06, - num_threads=None, + l2_regularization=1e-06, + number_of_threads=None, + calibrator=None, + max_calibration_examples=1000000, 
convergence_tolerance=0.0001, - max_iterations=20, - init_learning_rate=0.01, + number_of_iterations=20, + initial_learning_rate=0.01, shuffle=True, positive_instance_weight=1.0, check_frequency=None, - calibrator=None, - max_calibration_examples=1000000, **params): """ **Description** Train an Hogwild SGD binary model. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). - :param weight_column: Column to use for example weight (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param loss_function: Loss Function (inputs). - :param l2_weight: L2 Regularization constant (inputs). - :param num_threads: Degree of lock-free parallelism. Defaults to - automatic depending on data sparseness. Determinism not - guaranteed. (inputs). + :param l2_regularization: L2 Regularization constant (inputs). + :param number_of_threads: Degree of lock-free parallelism. + Defaults to automatic depending on data sparseness. + Determinism not guaranteed. (inputs). + :param calibrator: The calibrator kind to apply to the predictor. + Specify null for no calibration (inputs). + :param max_calibration_examples: The maximum number of examples + to use when training the calibrator (inputs). :param convergence_tolerance: Exponential moving averaged improvement tolerance for convergence (inputs). - :param max_iterations: Maximum number of iterations; set to 1 to - simulate online learning. (inputs). - :param init_learning_rate: Initial learning rate (only used by + :param number_of_iterations: Maximum number of iterations; set to + 1 to simulate online learning. (inputs). + :param initial_learning_rate: Initial learning rate (only used by SGD) (inputs). :param shuffle: Shuffle data every epoch? (inputs). :param positive_instance_weight: Apply weight to the positive @@ -58,10 +63,6 @@ def trainers_stochasticgradientdescentbinaryclassifier( :param check_frequency: Convergence check frequency (in terms of number of iterations). Default equals number of threads (inputs). - :param calibrator: The calibrator kind to apply to the predictor. - Specify null for no calibration (inputs). - :param max_calibration_examples: The maximum number of examples - to use when training the calibrator (inputs). :param predictor_model: The trained model (outputs). 
""" @@ -74,21 +75,21 @@ def trainers_stochasticgradientdescentbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -110,21 +111,30 @@ def trainers_stochasticgradientdescentbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if loss_function is not None: inputs['LossFunction'] = try_set( obj=loss_function, none_acceptable=True, is_of_type=dict) - if l2_weight is not None: - inputs['L2Weight'] = try_set( - obj=l2_weight, + if l2_regularization is not None: + inputs['L2Regularization'] = try_set( + obj=l2_regularization, none_acceptable=True, is_of_type=numbers.Real) - if num_threads is not None: - inputs['NumThreads'] = try_set( - obj=num_threads, + if number_of_threads is not None: + inputs['NumberOfThreads'] = try_set( + obj=number_of_threads, + none_acceptable=True, + is_of_type=numbers.Real) + if calibrator is not None: + inputs['Calibrator'] = try_set( + obj=calibrator, + none_acceptable=True, + is_of_type=dict) + if max_calibration_examples is not None: + inputs['MaxCalibrationExamples'] = try_set( + obj=max_calibration_examples, none_acceptable=True, is_of_type=numbers.Real) if convergence_tolerance is not None: @@ -132,14 +142,14 @@ def trainers_stochasticgradientdescentbinaryclassifier( obj=convergence_tolerance, none_acceptable=True, is_of_type=numbers.Real) - if max_iterations is not None: - inputs['MaxIterations'] = try_set( - obj=max_iterations, + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, none_acceptable=True, is_of_type=numbers.Real) - if init_learning_rate is not None: - inputs['InitLearningRate'] = try_set( - obj=init_learning_rate, + if initial_learning_rate is not None: + inputs['InitialLearningRate'] = try_set( + obj=initial_learning_rate, none_acceptable=True, is_of_type=numbers.Real) if shuffle is not None: @@ -157,16 +167,6 @@ def trainers_stochasticgradientdescentbinaryclassifier( obj=check_frequency, none_acceptable=True, is_of_type=numbers.Real) - if calibrator is not None: - inputs['Calibrator'] = try_set( - obj=calibrator, - none_acceptable=True, - is_of_type=dict) - if max_calibration_examples is not None: - inputs['MaxCalibrationExamples'] = try_set( - obj=max_calibration_examples, - none_acceptable=True, - is_of_type=numbers.Real) if predictor_model is not None: outputs['PredictorModel'] = try_set( obj=predictor_model, none_acceptable=False, is_of_type=str) diff --git a/src/python/nimbusml/internal/entrypoints/trainers_symsgdbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_symsgdbinaryclassifier.py index 5d2ba43d..3b1d3b40 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_symsgdbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_symsgdbinaryclassifier.py 
@@ -12,8 +12,8 @@ def trainers_symsgdbinaryclassifier( training_data, predictor_model=None, - feature_column='Features', - label_column='Label', + feature_column_name='Features', + label_column_name='Label', normalize_features='Auto', caching='Auto', number_of_iterations=50, @@ -31,11 +31,11 @@ def trainers_symsgdbinaryclassifier( Train a symbolic SGD. :param training_data: The data to be used for training (inputs). - :param feature_column: Column to use for features (inputs). - :param label_column: Column to use for labels (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). :param normalize_features: Normalize option for the feature column (inputs). - :param caching: Whether learner should cache input training data + :param caching: Whether trainer should cache input training data (inputs). :param number_of_iterations: Number of passes over the data. (inputs). @@ -67,15 +67,15 @@ def trainers_symsgdbinaryclassifier( obj=training_data, none_acceptable=False, is_of_type=str) - if feature_column is not None: - inputs['FeatureColumn'] = try_set( - obj=feature_column, + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, none_acceptable=True, is_of_type=str, is_column=True) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) @@ -97,7 +97,6 @@ def trainers_symsgdbinaryclassifier( values=[ 'Auto', 'Memory', - 'Disk', 'None']) if number_of_iterations is not None: inputs['NumberOfIterations'] = try_set( diff --git a/src/python/nimbusml/internal/entrypoints/transforms_categoricalhashonehotvectorizer.py b/src/python/nimbusml/internal/entrypoints/transforms_categoricalhashonehotvectorizer.py index 49ca7c20..9976119a 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_categoricalhashonehotvectorizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_categoricalhashonehotvectorizer.py @@ -14,11 +14,11 @@ def transforms_categoricalhashonehotvectorizer( data, output_data=None, model=None, - hash_bits=16, + number_of_bits=16, output_kind='Bag', seed=314489979, ordered=True, - invert_hash=0, + maximum_number_of_inverts=0, **params): """ **Description** @@ -28,18 +28,18 @@ def transforms_categoricalhashonehotvectorizer( it. :param column: New column definition(s) (optional form: - name:hashBits:src) (inputs). + name:numberOfBits:src) (inputs). :param data: Input dataset (inputs). - :param hash_bits: Number of bits to hash into. Must be between 1 - and 30, inclusive. (inputs). + :param number_of_bits: Number of bits to hash into. Must be + between 1 and 30, inclusive. (inputs). :param output_kind: Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) (inputs). :param seed: Hashing seed (inputs). :param ordered: Whether the position of each term should be included in the hash (inputs). - :param invert_hash: Limit the number of keys used to generate the - slot name to this many. 0 means no invert hashing, -1 means - no limit. (inputs). + :param maximum_number_of_inverts: Limit the number of keys used + to generate the slot name to this many. 0 means no invert + hashing, -1 means no limit. (inputs). :param output_data: Transformed dataset (outputs). :param model: Transform model (outputs). 
""" @@ -59,9 +59,9 @@ def transforms_categoricalhashonehotvectorizer( obj=data, none_acceptable=False, is_of_type=str) - if hash_bits is not None: - inputs['HashBits'] = try_set( - obj=hash_bits, + if number_of_bits is not None: + inputs['NumberOfBits'] = try_set( + obj=number_of_bits, none_acceptable=True, is_of_type=numbers.Real) if output_kind is not None: @@ -71,9 +71,9 @@ def transforms_categoricalhashonehotvectorizer( is_of_type=str, values=[ 'Bag', - 'Ind', + 'Indicator', 'Key', - 'Bin']) + 'Binary']) if seed is not None: inputs['Seed'] = try_set( obj=seed, @@ -84,9 +84,9 @@ def transforms_categoricalhashonehotvectorizer( obj=ordered, none_acceptable=True, is_of_type=bool) - if invert_hash is not None: - inputs['InvertHash'] = try_set( - obj=invert_hash, + if maximum_number_of_inverts is not None: + inputs['MaximumNumberOfInverts'] = try_set( + obj=maximum_number_of_inverts, none_acceptable=True, is_of_type=numbers.Real) if output_data is not None: diff --git a/src/python/nimbusml/internal/entrypoints/transforms_categoricalonehotvectorizer.py b/src/python/nimbusml/internal/entrypoints/transforms_categoricalonehotvectorizer.py index a0db9a0e..b0fd931e 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_categoricalonehotvectorizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_categoricalonehotvectorizer.py @@ -15,9 +15,9 @@ def transforms_categoricalonehotvectorizer( output_data=None, model=None, max_num_terms=1000000, - output_kind='Ind', + output_kind='Indicator', term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=True, **params): """ @@ -29,7 +29,7 @@ def transforms_categoricalonehotvectorizer( :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param max_num_terms: Maximum number of terms to keep per column + :param max_num_terms: Maximum number of keys to keep per column when auto-training (inputs). :param output_kind: Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) (inputs). @@ -72,9 +72,9 @@ def transforms_categoricalonehotvectorizer( is_of_type=str, values=[ 'Bag', - 'Ind', + 'Indicator', 'Key', - 'Bin']) + 'Binary']) if term is not None: inputs['Term'] = try_set( obj=term, @@ -86,8 +86,8 @@ def transforms_categoricalonehotvectorizer( none_acceptable=True, is_of_type=str, values=[ - 'Occurrence', - 'Value']) + 'ByOccurrence', + 'ByValue']) if text_key_values is not None: inputs['TextKeyValues'] = try_set( obj=text_key_values, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_dictionarizer.py b/src/python/nimbusml/internal/entrypoints/transforms_dictionarizer.py index 36f27d22..107273f9 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_dictionarizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_dictionarizer.py @@ -16,7 +16,7 @@ def transforms_dictionarizer( column=None, max_num_terms=1000000, term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=False, **params): """ @@ -27,7 +27,7 @@ def transforms_dictionarizer( :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param max_num_terms: Maximum number of terms to keep per column + :param max_num_terms: Maximum number of keys to keep per column when auto-training (inputs). :param term: List of terms (inputs). :param sort: How items should be ordered when vectorized. 
By @@ -72,8 +72,8 @@ def transforms_dictionarizer( none_acceptable=True, is_of_type=str, values=[ - 'Occurrence', - 'Value']) + 'ByOccurrence', + 'ByValue']) if text_key_values is not None: inputs['TextKeyValues'] = try_set( obj=text_key_values, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py b/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py index b87a45c4..0663f8cd 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py @@ -15,7 +15,7 @@ def transforms_featureselectorbymutualinformation( output_data=None, model=None, slots_in_output=1000, - label_column='Label', + label_column_name='Label', num_bins=256, **params): """ @@ -27,7 +27,7 @@ def transforms_featureselectorbymutualinformation( :param slots_in_output: The maximum number of slots to preserve in output (inputs). :param data: Input dataset (inputs). - :param label_column: Column to use for labels (inputs). + :param label_column_name: Column to use for labels (inputs). :param num_bins: Max number of bins for R4/R8 columns, power of 2 recommended (inputs). :param output_data: Transformed dataset (outputs). @@ -54,9 +54,9 @@ def transforms_featureselectorbymutualinformation( obj=data, none_acceptable=False, is_of_type=str) - if label_column is not None: + if label_column_name is not None: inputs['LabelColumn'] = try_set( - obj=label_column, + obj=label_column_name, none_acceptable=True, is_of_type=str, is_column=True) diff --git a/src/python/nimbusml/internal/entrypoints/transforms_hashconverter.py b/src/python/nimbusml/internal/entrypoints/transforms_hashconverter.py index b110aa34..4982aeb8 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_hashconverter.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_hashconverter.py @@ -14,7 +14,7 @@ def transforms_hashconverter( data, output_data=None, model=None, - hash_bits=31, + number_of_bits=31, join=True, seed=314489979, ordered=True, @@ -28,8 +28,8 @@ def transforms_hashconverter( :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param hash_bits: Number of bits to hash into. Must be between 1 - and 31, inclusive. (inputs). + :param number_of_bits: Number of bits to hash into. Must be + between 1 and 31, inclusive. (inputs). :param join: Whether the values need to be combined for a single hash (inputs). :param seed: Hashing seed (inputs). 
@@ -54,9 +54,9 @@ def transforms_hashconverter( obj=data, none_acceptable=False, is_of_type=str) - if hash_bits is not None: - inputs['HashBits'] = try_set( - obj=hash_bits, + if number_of_bits is not None: + inputs['NumberOfBits'] = try_set( + obj=number_of_bits, none_acceptable=True, is_of_type=numbers.Real) if join is not None: diff --git a/src/python/nimbusml/internal/entrypoints/transforms_imagepixelextractor.py b/src/python/nimbusml/internal/entrypoints/transforms_imagepixelextractor.py index f7ac56c9..9e17868f 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_imagepixelextractor.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_imagepixelextractor.py @@ -18,7 +18,8 @@ def transforms_imagepixelextractor( use_red=True, use_green=True, use_blue=True, - interleave_argb=False, + order='ARGB', + interleave=False, convert=True, offset=None, scale=None, @@ -35,8 +36,9 @@ def transforms_imagepixelextractor( :param use_red: Whether to use red channel (inputs). :param use_green: Whether to use green channel (inputs). :param use_blue: Whether to use blue channel (inputs). - :param interleave_argb: Whether to separate each channel or - interleave in ARGB order (inputs). + :param order: Order of colors. (inputs). + :param interleave: Whether to separate each channel or interleave + in specified order (inputs). :param convert: Whether to convert to floating point (inputs). :param offset: Offset (pre-scale) (inputs). :param scale: Scale factor (inputs). @@ -79,9 +81,21 @@ def transforms_imagepixelextractor( obj=use_blue, none_acceptable=True, is_of_type=bool) - if interleave_argb is not None: - inputs['InterleaveArgb'] = try_set( - obj=interleave_argb, + if order is not None: + inputs['Order'] = try_set( + obj=order, + none_acceptable=True, + is_of_type=str, + values=[ + 'ARGB', + 'ARBG', + 'ABRG', + 'ABGR', + 'AGRB', + 'AGBR']) + if interleave is not None: + inputs['Interleave'] = try_set( + obj=interleave, none_acceptable=True, is_of_type=bool) if convert is not None: diff --git a/src/python/nimbusml/internal/entrypoints/transforms_imageresizer.py b/src/python/nimbusml/internal/entrypoints/transforms_imageresizer.py index 1c9b3094..091d7423 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_imageresizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_imageresizer.py @@ -69,7 +69,8 @@ def transforms_imageresizer( is_of_type=str, values=[ 'IsoPad', - 'IsoCrop']) + 'IsoCrop', + 'Fill']) if crop_anchor is not None: inputs['CropAnchor'] = try_set( obj=crop_anchor, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_lpnormalizer.py b/src/python/nimbusml/internal/entrypoints/transforms_lpnormalizer.py index 15876bf8..14512725 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_lpnormalizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_lpnormalizer.py @@ -13,7 +13,7 @@ def transforms_lpnormalizer( data, output_data=None, model=None, - norm_kind='L2Norm', + norm='L2', sub_mean=False, **params): """ @@ -25,8 +25,7 @@ def transforms_lpnormalizer( :param column: New column definition(s) (optional form: name:src) (inputs). - :param norm_kind: The norm to use to normalize each sample - (inputs). + :param norm: The norm to use to normalize each sample (inputs). :param data: Input dataset (inputs). :param sub_mean: Subtract mean from each value before normalizing (inputs). 
@@ -44,16 +43,16 @@ def transforms_lpnormalizer( none_acceptable=False, is_of_type=list, is_column=True) - if norm_kind is not None: - inputs['NormKind'] = try_set( - obj=norm_kind, + if norm is not None: + inputs['Norm'] = try_set( + obj=norm, none_acceptable=True, is_of_type=str, values=[ - 'L2Norm', - 'StdDev', - 'L1Norm', - 'LInf']) + 'L2', + 'StandardDeviation', + 'L1', + 'Infinity']) if data is not None: inputs['Data'] = try_set( obj=data, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_ngramtranslator.py b/src/python/nimbusml/internal/entrypoints/transforms_ngramtranslator.py index 64fb855d..61d63e92 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_ngramtranslator.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_ngramtranslator.py @@ -22,20 +22,20 @@ def transforms_ngramtranslator( **params): """ **Description** - Produces a bag of counts of ngrams (sequences of consecutive values + Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by - building a dictionary of ngrams and using the id in the + building a dictionary of n-grams and using the id in the dictionary as the index in the bag. :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param ngram_length: Maximum ngram length (inputs). - :param all_lengths: Whether to store all ngram lengths up to + :param ngram_length: Maximum n-gram length (inputs). + :param all_lengths: Whether to store all n-gram lengths up to ngramLength, or only ngramLength (inputs). :param skip_length: Maximum number of tokens to skip when - constructing an ngram (inputs). - :param max_num_terms: Maximum number of ngrams to store in the + constructing an n-gram (inputs). + :param max_num_terms: Maximum number of n-grams to store in the dictionary (inputs). :param weighting: The weighting criteria (inputs). :param output_data: Transformed dataset (outputs). diff --git a/src/python/nimbusml/internal/entrypoints/transforms_pcacalculator.py b/src/python/nimbusml/internal/entrypoints/transforms_pcacalculator.py index c5255d30..67f4dd61 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_pcacalculator.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_pcacalculator.py @@ -14,7 +14,7 @@ def transforms_pcacalculator( data, output_data=None, model=None, - weight_column=None, + example_weight_column_name=None, rank=20, oversampling=20, center=True, @@ -28,7 +28,8 @@ def transforms_pcacalculator( :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param weight_column: The name of the weight column (inputs). + :param example_weight_column_name: The name of the weight column + (inputs). :param rank: The number of components in the PCA (inputs). :param oversampling: Oversampling parameter for randomized PCA training (inputs). 
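All of these entrypoint wrappers funnel their arguments through try_set before adding them to the inputs dictionary. The snippet below is a simplified sketch of the contract implied by the call sites above (reject None unless none_acceptable, check the type, and check membership in the values list when one is given); it is not the real nimbusml implementation. The demo values are the renamed LpNormalizer norms from the hunk above.

import numbers

def try_set(obj, none_acceptable=False, is_of_type=None, values=None,
            is_column=False):
    # Simplified stand-in for nimbusml's internal try_set: validate the
    # value and return it so it can be stored in the entrypoint 'inputs' dict.
    if obj is None:
        if none_acceptable:
            return None
        raise ValueError("a value is required")
    if is_of_type is not None and not isinstance(obj, is_of_type):
        raise TypeError("expected {0}, got {1}".format(is_of_type, type(obj)))
    if values is not None and obj not in values:
        raise ValueError("'{0}' is not one of {1}".format(obj, values))
    return obj

# The renamed norm values accepted by the LpNormalizer entrypoint:
print(try_set('L2', none_acceptable=True, is_of_type=str,
              values=['L2', 'StandardDeviation', 'L1', 'Infinity']))
print(try_set(20, none_acceptable=True, is_of_type=numbers.Real))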
@@ -54,9 +55,9 @@ def transforms_pcacalculator( obj=data, none_acceptable=False, is_of_type=str) - if weight_column is not None: - inputs['WeightColumn'] = try_set( - obj=weight_column, + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, none_acceptable=True, is_of_type=str, is_column=True) diff --git a/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py index 2b1aa6e7..73dc2ebe 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py @@ -28,6 +28,7 @@ def transforms_tensorflowscorer( save_location_operation='save/Const', save_operation='save/control_dependency', re_train=False, + add_batch_dimension_inputs=False, **params): """ **Description** @@ -64,6 +65,9 @@ def transforms_tensorflowscorer( specifiy the location for saving/restoring models from disk. (inputs). :param re_train: Retrain TensorFlow model. (inputs). + :param add_batch_dimension_inputs: Add a batch dimension to the + input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. + (inputs). :param output_data: Transformed dataset (outputs). :param model: Transform model (outputs). """ @@ -144,6 +148,11 @@ def transforms_tensorflowscorer( obj=re_train, none_acceptable=True, is_of_type=bool) + if add_batch_dimension_inputs is not None: + inputs['AddBatchDimensionInputs'] = try_set( + obj=add_batch_dimension_inputs, + none_acceptable=True, + is_of_type=bool) if output_data is not None: outputs['OutputData'] = try_set( obj=output_data, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_textfeaturizer.py b/src/python/nimbusml/internal/entrypoints/transforms_textfeaturizer.py index 416f8e40..d549098a 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_textfeaturizer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_textfeaturizer.py @@ -15,12 +15,12 @@ def transforms_textfeaturizer( output_data=None, model=None, language='English', - use_predefined_stop_word_remover=False, + stop_words_remover=None, text_case='Lower', keep_diacritics=False, keep_punctuations=True, keep_numbers=True, - output_tokens=False, + output_tokens_column_name=None, dictionary=None, word_feature_extractor=n_gram( max_num_terms=[10000000]), @@ -34,15 +34,14 @@ def transforms_textfeaturizer( **Description** A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of - (word and/or character) ngrams in a given tokenized text. + (word and/or character) n-grams in a given tokenized text. :param column: New column definition (optional form: name:srcs). (inputs). :param data: Input dataset (inputs). :param language: Dataset language or 'AutoDetect' to detect language per row. (inputs). - :param use_predefined_stop_word_remover: Use stop remover or not. - (inputs). + :param stop_words_remover: Stopwords remover. (inputs). :param text_case: Casing text using the rules of the invariant culture. (inputs). :param keep_diacritics: Whether to keep diacritical marks or @@ -51,8 +50,8 @@ def transforms_textfeaturizer( remove them. (inputs). :param keep_numbers: Whether to keep numbers or remove them. (inputs). - :param output_tokens: Whether to output the transformed text - tokens as an additional column. (inputs). + :param output_tokens_column_name: Column containing the + transformed text tokens. 
(inputs). :param dictionary: A dictionary of whitelisted terms. (inputs). :param word_feature_extractor: Ngram feature extractor to use for words (WordBag/WordHashBag). (inputs). @@ -95,11 +94,11 @@ def transforms_textfeaturizer( 'Italian', 'Spanish', 'Japanese']) - if use_predefined_stop_word_remover is not None: - inputs['UsePredefinedStopWordRemover'] = try_set( - obj=use_predefined_stop_word_remover, + if stop_words_remover is not None: + inputs['StopWordsRemover'] = try_set( + obj=stop_words_remover, none_acceptable=True, - is_of_type=bool) + is_of_type=dict) if text_case is not None: inputs['TextCase'] = try_set( obj=text_case, @@ -124,11 +123,12 @@ def transforms_textfeaturizer( obj=keep_numbers, none_acceptable=True, is_of_type=bool) - if output_tokens is not None: - inputs['OutputTokens'] = try_set( - obj=output_tokens, + if output_tokens_column_name is not None: + inputs['OutputTokensColumnName'] = try_set( + obj=output_tokens_column_name, none_acceptable=True, - is_of_type=bool) + is_of_type=str, + is_column=True) if dictionary is not None: inputs['Dictionary'] = try_set( obj=dictionary, @@ -155,7 +155,7 @@ def transforms_textfeaturizer( 'None', 'L1', 'L2', - 'LInf']) + 'Infinity']) if output_data is not None: outputs['OutputData'] = try_set( obj=output_data, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_texttokeyconverter.py b/src/python/nimbusml/internal/entrypoints/transforms_texttokeyconverter.py index f28b10f0..80cb4ef0 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_texttokeyconverter.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_texttokeyconverter.py @@ -16,7 +16,7 @@ def transforms_texttokeyconverter( column=None, max_num_terms=1000000, term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=False, **params): """ @@ -27,7 +27,7 @@ def transforms_texttokeyconverter( :param column: New column definition(s) (optional form: name:src) (inputs). :param data: Input dataset (inputs). - :param max_num_terms: Maximum number of terms to keep per column + :param max_num_terms: Maximum number of keys to keep per column when auto-training (inputs). :param term: List of terms (inputs). :param sort: How items should be ordered when vectorized. By @@ -72,8 +72,8 @@ def transforms_texttokeyconverter( none_acceptable=True, is_of_type=str, values=[ - 'Occurrence', - 'Value']) + 'ByOccurrence', + 'ByValue']) if text_key_values is not None: inputs['TextKeyValues'] = try_set( obj=text_key_values, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_vectortoimage.py b/src/python/nimbusml/internal/entrypoints/transforms_vectortoimage.py index 8444aab4..ccd2d9ef 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_vectortoimage.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_vectortoimage.py @@ -18,11 +18,16 @@ def transforms_vectortoimage( contains_red=True, contains_green=True, contains_blue=True, - interleave_argb=False, + order='ARGB', + interleave=False, image_width=0, image_height=0, - offset=None, - scale=None, + offset=0.0, + scale=1.0, + default_alpha=255, + default_red=0, + default_green=0, + default_blue=0, **params): """ **Description** @@ -35,12 +40,21 @@ def transforms_vectortoimage( :param contains_red: Whether to use red channel (inputs). :param contains_green: Whether to use green channel (inputs). :param contains_blue: Whether to use blue channel (inputs). - :param interleave_argb: Whether to separate each channel or - interleave in ARGB order (inputs). 
+ :param order: Order of colors. (inputs). + :param interleave: Whether to separate each channel or interleave + in specified order (inputs). :param image_width: Width of the image (inputs). :param image_height: Height of the image (inputs). :param offset: Offset (pre-scale) (inputs). :param scale: Scale factor (inputs). + :param default_alpha: Default value for alpha channel. Will be + used if ContainsAlpha set to false (inputs). + :param default_red: Default value for red channel. Will be used + if ContainsRed set to false (inputs). + :param default_green: Default value for green channel. Will be + used if ContainsGreen set to false (inputs). + :param default_blue: Default value for blue channel. Will be used + if ContainsBlue set to false (inputs). :param output_data: Transformed dataset (outputs). :param model: Transform model (outputs). """ @@ -80,9 +94,21 @@ def transforms_vectortoimage( obj=contains_blue, none_acceptable=True, is_of_type=bool) - if interleave_argb is not None: - inputs['InterleaveArgb'] = try_set( - obj=interleave_argb, + if order is not None: + inputs['Order'] = try_set( + obj=order, + none_acceptable=True, + is_of_type=str, + values=[ + 'ARGB', + 'ARBG', + 'ABRG', + 'ABGR', + 'AGRB', + 'AGBR']) + if interleave is not None: + inputs['Interleave'] = try_set( + obj=interleave, none_acceptable=True, is_of_type=bool) if image_width is not None: @@ -105,6 +131,26 @@ def transforms_vectortoimage( obj=scale, none_acceptable=True, is_of_type=numbers.Real) + if default_alpha is not None: + inputs['DefaultAlpha'] = try_set( + obj=default_alpha, + none_acceptable=True, + is_of_type=numbers.Real) + if default_red is not None: + inputs['DefaultRed'] = try_set( + obj=default_red, + none_acceptable=True, + is_of_type=numbers.Real) + if default_green is not None: + inputs['DefaultGreen'] = try_set( + obj=default_green, + none_acceptable=True, + is_of_type=numbers.Real) + if default_blue is not None: + inputs['DefaultBlue'] = try_set( + obj=default_blue, + none_acceptable=True, + is_of_type=numbers.Real) if output_data is not None: outputs['OutputData'] = try_set( obj=output_data, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordembeddings.py b/src/python/nimbusml/internal/entrypoints/transforms_wordembeddings.py index 25145280..4bd9585e 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_wordembeddings.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_wordembeddings.py @@ -13,7 +13,7 @@ def transforms_wordembeddings( data, output_data=None, model=None, - model_kind='Sswe', + model_kind='SentimentSpecificWordEmbedding', custom_lookup_table=None, **params): """ @@ -58,7 +58,7 @@ def transforms_wordembeddings( 'GloVeTwitter100D', 'GloVeTwitter200D', 'FastTextWikipedia300D', - 'Sswe']) + 'SentimentSpecificWordEmbedding']) if data is not None: inputs['Data'] = try_set( obj=data, diff --git a/src/python/nimbusml/internal/utils/data_roles.py b/src/python/nimbusml/internal/utils/data_roles.py index d3ff8799..f00829b2 100644 --- a/src/python/nimbusml/internal/utils/data_roles.py +++ b/src/python/nimbusml/internal/utils/data_roles.py @@ -66,19 +66,48 @@ class Role: RowId = 'RowId' @staticmethod - def to_attribute(role, suffix="_column"): + def to_attribute(role, suffix="_column_name"): """ Converts a role into an attribute name. - ``GroupId --> group_id_column``. + ``GroupId --> row_group_column_name``. 
""" if not isinstance(role, str): raise TypeError("Unexpected role '{0}'".format(role)) + if role == "Weight": + return "example_weight" + suffix if role == "GroupId": - return "group_id" + suffix + return "row_group" + suffix if role == "RowId": return "row_id" + suffix return role.lower() + suffix + @staticmethod + def to_parameter(role, suffix="ColumnName"): + """ + Converts a role into (as per manifesrt.json) parameter name. + ``GroupId --> RowGroupColumnName``. + """ + if not isinstance(role, str): + raise TypeError("Unexpected role '{0}'".format(role)) + if role == "Weight": + return "ExampleWeight" + suffix + if role == "GroupId": + return "RowGroup" + suffix + return role + suffix + + @staticmethod + def to_role(column_name, suffix="_column_name"): + """ + Converts an attribute name to role + ``row_group_column_name -> group_id``. + """ + if not isinstance(column_name, str): + raise TypeError("Unexpected column_name '{0}'".format(column_name)) + if column_name == "example_weight" + suffix: + return "weight" + if column_name == "row_group" + suffix: + return "group_id" + return column_name.lower().split(suffix)[0] class DataRoles(Role): """ @@ -91,9 +120,8 @@ class DataRoles(Role): # train and predict. _allowed = set( k for k in Role.__dict__ if k[0] != '_' and k[0].upper() == k[0]) - _allowed_attr = {Role.to_attribute(k): Role.to_attribute( - k, suffix='') for k in Role.__dict__ if - k[0] != '_' and k[0].upper() == k[0]} + _allowed_attr = {Role.to_attribute(k): Role.to_role(k) + for k in Role.__dict__ if k[0] != '_' and k[0].upper() == k[0]} @staticmethod def check_role(role): diff --git a/src/python/nimbusml/internal/utils/data_schema.py b/src/python/nimbusml/internal/utils/data_schema.py index 5faa0f72..a7425267 100644 --- a/src/python/nimbusml/internal/utils/data_schema.py +++ b/src/python/nimbusml/internal/utils/data_schema.py @@ -334,7 +334,7 @@ class DataSchema: exp = Pipeline([ OneHotVectorizer(columns = ['text']), - LightGbmRegressor(min_data_per_leaf = 1) + LightGbmRegressor(minimum_example_count_per_leaf = 1) ]) exp.fit(FileDataStream('data.csv', schema = schema), 'y') diff --git a/src/python/nimbusml/internal/utils/data_stream.py b/src/python/nimbusml/internal/utils/data_stream.py index 8c4ef67f..ede031d9 100644 --- a/src/python/nimbusml/internal/utils/data_stream.py +++ b/src/python/nimbusml/internal/utils/data_stream.py @@ -214,7 +214,7 @@ class FileDataStream(DataStream): #1 2.2 class 3.0 exp = Pipeline([ OneHotVectorizer(columns = ['text']), - LightGbmRegressor(min_data_per_leaf = 1) + LightGbmRegressor(minimum_example_count_per_leaf = 1) ]) exp.fit(ds, 'y') diff --git a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py index 02d48768..0b467a37 100644 --- a/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/averagedperceptronbinaryclassifier.py @@ -99,7 +99,7 @@ class AveragedPerceptronBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'hinge' `. Other choices are :py:class:`'exp' `, :py:class:`'log' @@ -107,31 +107,36 @@ class AveragedPerceptronBinaryClassifier( `. For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). 
- :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param decrease_learning_rate: Decrease learning rate. - :param l2_regularizer_weight: L2 Regularization Weight. + :param l2_regularization: L2 Regularization Weight. - :param num_iterations: Number of iterations. + :param number_of_iterations: Number of iterations. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average. - :param do_lazy_updates: Instead of updating averaged weights on every - example, only update when loss is nonzero. + :param lazy_update: Instead of updating averaged weights on every example, + only update when loss is nonzero. :param recency_gain: Extra weight given to more recent updates. - :param recency_gain_multi: Whether Recency Gain is multiplicative (vs. - additive). + :param recency_gain_multiplicative: Whether Recency Gain is multiplicative + (vs. additive). :param averaged: Do averaging?. @@ -141,8 +146,6 @@ class AveragedPerceptronBinaryClassifier( :param shuffle: Whether to shuffle for each training iteration. - :param streaming_cache_size: Size of cache when trained in Scope. - :param params: Additional arguments sent to compute engine. .. 
seealso:: @@ -165,32 +168,31 @@ def __init__( loss='hinge', learning_rate=1.0, decrease_learning_rate=False, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, feature=None, label=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, @@ -199,18 +201,17 @@ def __init__( loss=loss, learning_rate=learning_rate, decrease_learning_rate=decrease_learning_rate, - l2_regularizer_weight=l2_regularizer_weight, - num_iterations=num_iterations, - init_wts_diameter=init_wts_diameter, + l2_regularization=l2_regularization, + number_of_iterations=number_of_iterations, + initial_weights_diameter=initial_weights_diameter, reset_weights_after_x_examples=reset_weights_after_x_examples, - do_lazy_updates=do_lazy_updates, + lazy_update=lazy_update, recency_gain=recency_gain, - recency_gain_multi=recency_gain_multi, + recency_gain_multiplicative=recency_gain_multiplicative, averaged=averaged, averaged_tolerance=averaged_tolerance, initial_weights=initial_weights, shuffle=shuffle, - streaming_cache_size=streaming_cache_size, **params) self.feature = feature self.label = label diff --git a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py index 9374edd6..4758454b 100644 --- a/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/fastlinearbinaryclassifier.py @@ -70,7 +70,7 @@ class FastLinearBinaryClassifier( optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets - ``shuffle`` to ``False`` and ``train_threads`` to ``1``. + ``shuffle`` to ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -88,8 +88,10 @@ class FastLinearBinaryClassifier( :param label: see `Columns `_. - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. + :param weight: see `Columns `_. + + :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -118,7 +120,7 @@ class FastLinearBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. 
Other choices are :py:class:`'hinge' `, and @@ -126,7 +128,7 @@ class FastLinearBinaryClassifier( information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param positive_instance_weight: Apply weight to the positive class, for @@ -135,14 +137,15 @@ class FastLinearBinaryClassifier( :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. @@ -166,50 +169,57 @@ class FastLinearBinaryClassifier( @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='log', - train_threads=None, + number_of_threads=None, positive_instance_weight=1.0, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, feature=None, label=None, + weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label + if 'example_weight_column_name' in params: + raise NameError( + "'example_weight_column_name' must be renamed to 'weight'") + if weight: + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - l2_weight=l2_weight, + l2_regularization=l2_regularization, l1_threshold=l1_threshold, normalize=normalize, caching=caching, loss=loss, - train_threads=train_threads, + number_of_threads=number_of_threads, positive_instance_weight=positive_instance_weight, convergence_tolerance=convergence_tolerance, - max_iterations=max_iterations, + maximum_number_of_iterations=maximum_number_of_iterations, shuffle=shuffle, - check_frequency=check_frequency, + convergence_check_frequency=convergence_check_frequency, bias_learning_rate=bias_learning_rate, **params) self.feature = feature self.label = label + self.weight = weight @trace def predict_proba(self, X, **params): diff --git a/src/python/nimbusml/linear_model/fastlinearclassifier.py 
b/src/python/nimbusml/linear_model/fastlinearclassifier.py index c9546c25..d1ef7644 100644 --- a/src/python/nimbusml/linear_model/fastlinearclassifier.py +++ b/src/python/nimbusml/linear_model/fastlinearclassifier.py @@ -67,7 +67,7 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -85,8 +85,10 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): :param label: see `Columns `_. - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. + :param weight: see `Columns `_. + + :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -115,7 +117,7 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. Other choices are @@ -125,20 +127,21 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. 
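For the FastLinear* estimators the net effect of these hunks is that callers now pass l2_regularization, number_of_threads, maximum_number_of_iterations and convergence_check_frequency, plus an optional weight role. A minimal sketch follows, assuming the sklearn-style fit(X, y) entry point is unchanged; the toy data and chosen values are illustrative only. The shuffle=False plus number_of_threads=1 combination is the reproducibility setting recommended in the docstring above.

import pandas as pd
from nimbusml.linear_model import FastLinearBinaryClassifier

# Made-up toy data for illustration.
X = pd.DataFrame({'x1': [0.1, 0.9, 0.2, 0.8], 'x2': [1.0, 0.0, 0.9, 0.1]})
y = pd.Series([0.0, 1.0, 0.0, 1.0])

clf = FastLinearBinaryClassifier(l2_regularization=0.01,
                                 maximum_number_of_iterations=10,
                                 number_of_threads=1,  # with shuffle=False: reproducible
                                 shuffle=False)
clf.fit(X, y)
print(clf.predict_proba(X)[:2])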
@@ -162,48 +165,55 @@ class FastLinearClassifier(core, BasePredictor, ClassifierMixin): @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='log', - train_threads=None, + number_of_threads=None, convergence_tolerance=0.1, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=0.0, feature=None, label=None, + weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label + if 'example_weight_column_name' in params: + raise NameError( + "'example_weight_column_name' must be renamed to 'weight'") + if weight: + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, - l2_weight=l2_weight, + l2_regularization=l2_regularization, l1_threshold=l1_threshold, normalize=normalize, caching=caching, loss=loss, - train_threads=train_threads, + number_of_threads=number_of_threads, convergence_tolerance=convergence_tolerance, - max_iterations=max_iterations, + maximum_number_of_iterations=maximum_number_of_iterations, shuffle=shuffle, - check_frequency=check_frequency, + convergence_check_frequency=convergence_check_frequency, bias_learning_rate=bias_learning_rate, **params) self.feature = feature self.label = label + self.weight = weight @trace def predict_proba(self, X, **params): diff --git a/src/python/nimbusml/linear_model/fastlinearregressor.py b/src/python/nimbusml/linear_model/fastlinearregressor.py index 7e180d1c..766a79ae 100644 --- a/src/python/nimbusml/linear_model/fastlinearregressor.py +++ b/src/python/nimbusml/linear_model/fastlinearregressor.py @@ -67,7 +67,7 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets ``shuffle`` to - ``False`` and ``train_threads`` to ``1``. + ``False`` and ``number_of_threads`` to ``1``. **Reference** @@ -85,8 +85,10 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): :param label: see `Columns `_. - :param l2_weight: L2 regularizer constant. By default the l2 constant is - automatically inferred based on data set. + :param weight: see `Columns `_. + + :param l2_regularization: L2 regularizer constant. By default the l2 + constant is automatically inferred based on data set. :param l1_threshold: L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw @@ -115,26 +117,27 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The only supported loss is :py:class:`'squared' `. 
For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. :param convergence_tolerance: The tolerance for the ratio between duality gap and primal loss for convergence checking. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. Defaults to automatic. + :param maximum_number_of_iterations: Maximum number of iterations; set to 1 + to simulate online learning. Defaults to automatic. :param shuffle: Shuffle data every epoch?. - :param check_frequency: Convergence check frequency (in terms of number of - iterations). Set as negative or zero for not checking at all. If left - blank, it defaults to check after every 'numThreads' iterations. + :param convergence_check_frequency: Convergence check frequency (in terms + of number of iterations). Set as negative or zero for not checking at + all. If left blank, it defaults to check after every 'numThreads' + iterations. :param bias_learning_rate: The learning rate for adjusting bias from being regularized. @@ -158,48 +161,55 @@ class FastLinearRegressor(core, BasePredictor, RegressorMixin): @trace def __init__( self, - l2_weight=None, + l2_regularization=None, l1_threshold=None, normalize='Auto', caching='Auto', loss='squared', - train_threads=None, + number_of_threads=None, convergence_tolerance=0.01, - max_iterations=None, + maximum_number_of_iterations=None, shuffle=True, - check_frequency=None, + convergence_check_frequency=None, bias_learning_rate=1.0, feature=None, label=None, + weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label + if 'example_weight_column_name' in params: + raise NameError( + "'example_weight_column_name' must be renamed to 'weight'") + if weight: + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, - l2_weight=l2_weight, + l2_regularization=l2_regularization, l1_threshold=l1_threshold, normalize=normalize, caching=caching, loss=loss, - train_threads=train_threads, + number_of_threads=number_of_threads, convergence_tolerance=convergence_tolerance, - max_iterations=max_iterations, + maximum_number_of_iterations=maximum_number_of_iterations, shuffle=shuffle, - check_frequency=check_frequency, + convergence_check_frequency=convergence_check_frequency, bias_learning_rate=bias_learning_rate, **params) self.feature = feature self.label = label + self.weight = weight def get_params(self, deep=False): """ diff --git a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py index 38df685b..1cf29de4 100644 --- a/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/logisticregressionbinaryclassifier.py @@ -119,16 +119,18 @@ class 
LogisticRegressionBinaryClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param show_training_statistics: Show statistics of training examples. - :param l1_weight: L1 regularization weight. + :param l2_regularization: L2 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param l1_regularization: L1 regularization weight. - :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. + + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -139,23 +141,23 @@ class LogisticRegressionBinaryClassifier( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. 
If ``False``, enables the logistic regression @@ -183,54 +185,56 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, feature=None, label=None, weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, normalize=normalize, caching=caching, - l2_weight=l2_weight, - l1_weight=l1_weight, - opt_tol=opt_tol, - memory_size=memory_size, + show_training_statistics=show_training_statistics, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, + optimization_tolerance=optimization_tolerance, + history_size=history_size, enforce_non_negativity=enforce_non_negativity, - init_wts_diameter=init_wts_diameter, - max_iterations=max_iterations, - sgd_init_tol=sgd_init_tol, + initial_weights_diameter=initial_weights_diameter, + maximum_number_of_iterations=maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=stochastic_gradient_descent_initilaization_tolerance, quiet=quiet, use_threads=use_threads, - train_threads=train_threads, + number_of_threads=number_of_threads, dense_optimizer=dense_optimizer, **params) self.feature = feature diff --git a/src/python/nimbusml/linear_model/logisticregressionclassifier.py b/src/python/nimbusml/linear_model/logisticregressionclassifier.py index f6ded82f..265adc10 100644 --- a/src/python/nimbusml/linear_model/logisticregressionclassifier.py +++ b/src/python/nimbusml/linear_model/logisticregressionclassifier.py @@ -120,16 +120,18 @@ class LogisticRegressionClassifier( normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param show_training_statistics: Show statistics of training examples. - :param l1_weight: L1 regularization weight. + :param l2_regularization: L2 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param l1_regularization: L1 regularization weight. 
- :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. + + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -140,23 +142,23 @@ class LogisticRegressionClassifier( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. 
If ``False``, enables the logistic regression @@ -184,54 +186,56 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + show_training_statistics=False, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, feature=None, label=None, weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, normalize=normalize, caching=caching, - l2_weight=l2_weight, - l1_weight=l1_weight, - opt_tol=opt_tol, - memory_size=memory_size, + show_training_statistics=show_training_statistics, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, + optimization_tolerance=optimization_tolerance, + history_size=history_size, enforce_non_negativity=enforce_non_negativity, - init_wts_diameter=init_wts_diameter, - max_iterations=max_iterations, - sgd_init_tol=sgd_init_tol, + initial_weights_diameter=initial_weights_diameter, + maximum_number_of_iterations=maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=stochastic_gradient_descent_initilaization_tolerance, quiet=quiet, use_threads=use_threads, - train_threads=train_threads, + number_of_threads=number_of_threads, dense_optimizer=dense_optimizer, **params) self.feature = feature diff --git a/src/python/nimbusml/linear_model/onlinegradientdescentregressor.py b/src/python/nimbusml/linear_model/onlinegradientdescentregressor.py index 71796158..d8f76a73 100644 --- a/src/python/nimbusml/linear_model/onlinegradientdescentregressor.py +++ b/src/python/nimbusml/linear_model/onlinegradientdescentregressor.py @@ -71,7 +71,7 @@ class OnlineGradientDescentRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'hinge' `. Other choices are :py:class:`'exp' `, @@ -79,32 +79,37 @@ class OnlineGradientDescentRegressor( `. For more information, please see :py:class:`'loss' `. - :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. 
This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param decrease_learning_rate: Decrease learning rate. - :param l2_regularizer_weight: L2 Regularization Weight. + :param l2_regularization: L2 Regularization Weight. - :param num_iterations: Number of iterations. + :param number_of_iterations: Number of iterations. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. :param reset_weights_after_x_examples: Number of examples after which weights will be reset to the current average. - :param do_lazy_updates: Instead of updating averaged weights on every - example, only update when loss is nonzero. + :param lazy_update: Instead of updating averaged weights on every example, + only update when loss is nonzero. :param recency_gain: Extra weight given to more recent updates (`do_lazy_updates`` must be **False**). - :param recency_gain_multi: Whether Recency Gain is multiplicative vs. - additive (`do_lazy_updates`` must be **False**). + :param recency_gain_multiplicative: Whether Recency Gain is multiplicative + (vs. additive). :param averaged: Do averaging?. @@ -114,8 +119,6 @@ class OnlineGradientDescentRegressor( :param shuffle: Whether to shuffle for each training iteration. - :param streaming_cache_size: Size of cache when trained in Scope. - :param params: Additional arguments sent to compute engine. .. 
seealso:: @@ -141,32 +144,31 @@ def __init__( loss='squared', learning_rate=0.1, decrease_learning_rate=True, - l2_regularizer_weight=0.0, - num_iterations=1, - init_wts_diameter=0.0, + l2_regularization=0.0, + number_of_iterations=1, + initial_weights_diameter=0.0, reset_weights_after_x_examples=None, - do_lazy_updates=True, + lazy_update=True, recency_gain=0.0, - recency_gain_multi=False, + recency_gain_multiplicative=False, averaged=True, averaged_tolerance=0.01, initial_weights=None, shuffle=True, - streaming_cache_size=1000000, feature=None, label=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, @@ -175,18 +177,17 @@ def __init__( loss=loss, learning_rate=learning_rate, decrease_learning_rate=decrease_learning_rate, - l2_regularizer_weight=l2_regularizer_weight, - num_iterations=num_iterations, - init_wts_diameter=init_wts_diameter, + l2_regularization=l2_regularization, + number_of_iterations=number_of_iterations, + initial_weights_diameter=initial_weights_diameter, reset_weights_after_x_examples=reset_weights_after_x_examples, - do_lazy_updates=do_lazy_updates, + lazy_update=lazy_update, recency_gain=recency_gain, - recency_gain_multi=recency_gain_multi, + recency_gain_multiplicative=recency_gain_multiplicative, averaged=averaged, averaged_tolerance=averaged_tolerance, initial_weights=initial_weights, shuffle=shuffle, - streaming_cache_size=streaming_cache_size, **params) self.feature = feature self.label = label diff --git a/src/python/nimbusml/linear_model/ordinaryleastsquaresregressor.py b/src/python/nimbusml/linear_model/ordinaryleastsquaresregressor.py index 08d07ac6..585ac2a9 100644 --- a/src/python/nimbusml/linear_model/ordinaryleastsquaresregressor.py +++ b/src/python/nimbusml/linear_model/ordinaryleastsquaresregressor.py @@ -69,11 +69,11 @@ class OrdinaryLeastSquaresRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param l2_regularization: L2 regularization weight. - :param per_parameter_significance: Whether to calculate per parameter + :param calculate_statistics: Whether to calculate per parameter significance statistics. :param params: Additional arguments sent to compute engine. 
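OrdinaryLeastSquaresRegressor follows the same pattern: l2_weight becomes l2_regularization and per_parameter_significance becomes calculate_statistics, and, per the __init__ guard in the next hunk, the raw ML.NET-style column-name kwargs now raise a NameError pointing at the role aliases. A short sketch with made-up data:

import pandas as pd
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor

X = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0], 'x2': [0.5, 0.1, 0.9, 0.3]})
y = pd.Series([1.1, 2.2, 2.9, 4.1])

ols = OrdinaryLeastSquaresRegressor(l2_regularization=1e-6,
                                    calculate_statistics=True)
ols.fit(X, y)
print(ols.predict(X)[:2])

try:
    OrdinaryLeastSquaresRegressor(feature_column_name='x1')
except NameError as e:
    print(e)  # "'feature_column_name' must be renamed to 'feature'"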
@@ -98,35 +98,35 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1e-06, - per_parameter_significance=True, + l2_regularization=1e-06, + calculate_statistics=True, feature=None, label=None, weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, normalize=normalize, caching=caching, - l2_weight=l2_weight, - per_parameter_significance=per_parameter_significance, + l2_regularization=l2_regularization, + calculate_statistics=calculate_statistics, **params) self.feature = feature self.label = label diff --git a/src/python/nimbusml/linear_model/poissonregressionregressor.py b/src/python/nimbusml/linear_model/poissonregressionregressor.py index c034f179..6d56f380 100644 --- a/src/python/nimbusml/linear_model/poissonregressionregressor.py +++ b/src/python/nimbusml/linear_model/poissonregressionregressor.py @@ -70,16 +70,16 @@ class PoissonRegressionRegressor( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. - :param l2_weight: L2 regularization weight. + :param l2_regularization: L2 regularization weight. - :param l1_weight: L1 regularization weight. + :param l1_regularization: L1 regularization weight. - :param opt_tol: Tolerance parameter for optimization convergence. Low = - slower, more accurate. + :param optimization_tolerance: Tolerance parameter for optimization + convergence. Low = slower, more accurate. - :param memory_size: Memory size for L-BFGS. Lower=faster, less accurate. + :param history_size: Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store @@ -90,23 +90,23 @@ class PoissonRegressionRegressor( however, does not put any constraint on the bias term; that is, the bias term can be still a negtaive number. - :param init_wts_diameter: Sets the initial weights diameter that specifies - the range from which values are drawn for the initial weights. These - weights are initialized randomly from within this range. For example, - if the diameter is specified to be ``d``, then the weights are - uniformly distributed between ``-d/2`` and ``d/2``. The default value - is ``0``, which specifies that all the weights are set to zero. + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. 
These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. - :param max_iterations: Maximum iterations. + :param maximum_number_of_iterations: Maximum iterations. - :param sgd_init_tol: Run SGD to initialize LR weights, converging to this - tolerance. + :param stochastic_gradient_descent_initilaization_tolerance: Run SGD to + initialize LR weights, converging to this tolerance. :param quiet: If set to true, produce no output during training. :param use_threads: Whether or not to use threads. Default is true. - :param train_threads: Number of threads. + :param number_of_threads: Number of threads. :param dense_optimizer: If ``True``, forces densification of the internal optimization vectors. If ``False``, enables the logistic regression @@ -139,54 +139,54 @@ def __init__( self, normalize='Auto', caching='Auto', - l2_weight=1.0, - l1_weight=1.0, - opt_tol=1e-07, - memory_size=20, + l2_regularization=1.0, + l1_regularization=1.0, + optimization_tolerance=1e-07, + history_size=20, enforce_non_negativity=False, - init_wts_diameter=0.0, - max_iterations=2147483647, - sgd_init_tol=0.0, + initial_weights_diameter=0.0, + maximum_number_of_iterations=2147483647, + stochastic_gradient_descent_initilaization_tolerance=0.0, quiet=False, use_threads=True, - train_threads=None, + number_of_threads=None, dense_optimizer=False, feature=None, label=None, weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='regressor', **params) core.__init__( self, normalize=normalize, caching=caching, - l2_weight=l2_weight, - l1_weight=l1_weight, - opt_tol=opt_tol, - memory_size=memory_size, + l2_regularization=l2_regularization, + l1_regularization=l1_regularization, + optimization_tolerance=optimization_tolerance, + history_size=history_size, enforce_non_negativity=enforce_non_negativity, - init_wts_diameter=init_wts_diameter, - max_iterations=max_iterations, - sgd_init_tol=sgd_init_tol, + initial_weights_diameter=initial_weights_diameter, + maximum_number_of_iterations=maximum_number_of_iterations, + stochastic_gradient_descent_initilaization_tolerance=stochastic_gradient_descent_initilaization_tolerance, quiet=quiet, use_threads=use_threads, - train_threads=train_threads, + number_of_threads=number_of_threads, dense_optimizer=dense_optimizer, **params) self.feature = feature diff --git a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py index b45e8bf2..a5ee573d 100644 --- 
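The Poisson regressor follows the same renaming; a small sketch under the assumption of a count-valued toy target (data invented):

    import pandas as pd
    from nimbusml.linear_model import PoissonRegressionRegressor

    X = pd.DataFrame(dict(x1=[0.5, 1.5, 2.5, 3.5], x2=[1.0, 0.0, 1.0, 0.0]))
    y = pd.Series([0, 1, 2, 4], name='counts')  # Poisson targets are non-negative counts

    prr = PoissonRegressionRegressor(
        l2_regularization=1.0,              # was l2_weight
        l1_regularization=1.0,              # was l1_weight
        optimization_tolerance=1e-07,       # was opt_tol
        history_size=20,                    # was memory_size (L-BFGS history length)
        maximum_number_of_iterations=100,   # was max_iterations
        number_of_threads=1)                # was train_threads
    prr.fit(X, y)
    print(prr.predict(X))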
a/src/python/nimbusml/linear_model/sgdbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/sgdbinaryclassifier.py @@ -72,7 +72,7 @@ class SgdBinaryClassifier(core, BasePredictor, ClassifierMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param loss: The default is :py:class:`'log' `. Other choices are :py:class:`'exp' `, :py:class:`'hinge' @@ -80,18 +80,18 @@ class SgdBinaryClassifier(core, BasePredictor, ClassifierMixin): `. For more information, please see the documentation page about losses, [Loss](xref:nimbusml.loss). - :param l2_weight: L2 Regularization constant. + :param l2_regularization: L2 Regularization constant. - :param train_threads: Degree of lock-free parallelism. Defaults to + :param number_of_threads: Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed. :param convergence_tolerance: Exponential moving averaged improvement tolerance for convergence. - :param max_iterations: Maximum number of iterations; set to 1 to simulate - online learning. + :param number_of_iterations: Maximum number of iterations; set to 1 to + simulate online learning. - :param init_learning_rate: Initial learning rate (only used by SGD). + :param initial_learning_rate: Initial learning rate (only used by SGD). :param shuffle: Shuffle data every epoch?. @@ -122,11 +122,11 @@ def __init__( normalize='Auto', caching='Auto', loss='log', - l2_weight=1e-06, - train_threads=None, + l2_regularization=1e-06, + number_of_threads=None, convergence_tolerance=0.0001, - max_iterations=20, - init_learning_rate=0.01, + number_of_iterations=20, + initial_learning_rate=0.01, shuffle=True, positive_instance_weight=1.0, check_frequency=None, @@ -135,32 +135,32 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, normalize=normalize, caching=caching, loss=loss, - l2_weight=l2_weight, - train_threads=train_threads, + l2_regularization=l2_regularization, + number_of_threads=number_of_threads, convergence_tolerance=convergence_tolerance, - max_iterations=max_iterations, - init_learning_rate=init_learning_rate, + number_of_iterations=number_of_iterations, + initial_learning_rate=initial_learning_rate, shuffle=shuffle, positive_instance_weight=positive_instance_weight, check_frequency=check_frequency, diff --git a/src/python/nimbusml/linear_model/symsgdbinaryclassifier.py b/src/python/nimbusml/linear_model/symsgdbinaryclassifier.py index 5f5d1e87..afe51ad8 100644 --- 
a/src/python/nimbusml/linear_model/symsgdbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/symsgdbinaryclassifier.py @@ -73,11 +73,16 @@ class SymSgdBinaryClassifier( and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param number_of_iterations: Number of passes over the data. - :param learning_rate: Learning rate. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param l2_regularization: L2 regularization. @@ -139,16 +144,16 @@ def __init__( label=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, diff --git a/src/python/nimbusml/model_selection/cv.py b/src/python/nimbusml/model_selection/cv.py index 532bed87..d719e07f 100644 --- a/src/python/nimbusml/model_selection/cv.py +++ b/src/python/nimbusml/model_selection/cv.py @@ -180,7 +180,7 @@ def _clean_ranking_metrics(metrics): _add_confusion_matrix() elif learner_type == 'multiclass': - self._cv_kind = 'SignatureMultiClassClassifierTrainer' + self._cv_kind = 'SignatureMulticlassClassificationTrainer' self._predictions_columns = [ CV.fold_column_name, 'Instance', diff --git a/src/python/nimbusml/multiclass/onevsrestclassifier.py b/src/python/nimbusml/multiclass/onevsrestclassifier.py index fc9a9abe..238905f1 100644 --- a/src/python/nimbusml/multiclass/onevsrestclassifier.py +++ b/src/python/nimbusml/multiclass/onevsrestclassifier.py @@ -55,7 +55,7 @@ class OneVsRestClassifier(core, BasePredictor, ClassifierMixin): normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param params: Additional arguments sent to compute engine. 
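Role columns are still supplied through feature=, label= and weight=; passing the underlying ML.NET column-name keywords raises NameError at construction. A sketch with invented data:

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.linear_model import SgdBinaryClassifier

    df = pd.DataFrame(dict(x1=[1.0, 2.0, 3.0, 4.0],
                           x2=[0.0, 1.0, 0.0, 1.0],
                           y=[0, 1, 0, 1]))

    sgd = SgdBinaryClassifier(
        l2_regularization=1e-06,      # was l2_weight
        number_of_threads=1,          # was train_threads
        number_of_iterations=20,      # was max_iterations
        initial_learning_rate=0.01,   # was init_learning_rate
        feature=['x1', 'x2'], label='y')
    Pipeline([sgd]).fit(df, verbose=0)

    # The ML.NET-side names are rejected when passed directly:
    try:
        SgdBinaryClassifier(label_column_name='y')
    except NameError as err:
        print(err)  # 'label_column_name' must be renamed to 'label'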
@@ -110,21 +110,21 @@ def __init__( weight=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label - if 'weight_column' in params: + params['label_column_name'] = label + if 'example_weight_column_name' in params: raise NameError( - "'weight_column' must be renamed to 'weight'") + "'example_weight_column_name' must be renamed to 'weight'") if weight: - params['weight_column'] = weight + params['example_weight_column_name'] = weight BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, diff --git a/src/python/nimbusml/naive_bayes/naivebayesclassifier.py b/src/python/nimbusml/naive_bayes/naivebayesclassifier.py index 5c971595..14a1a83d 100644 --- a/src/python/nimbusml/naive_bayes/naivebayesclassifier.py +++ b/src/python/nimbusml/naive_bayes/naivebayesclassifier.py @@ -67,7 +67,7 @@ class NaiveBayesClassifier(core, BasePredictor, ClassifierMixin): and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves sparsity by mapping zero to zero. - :param caching: Whether learner should cache input training data. + :param caching: Whether trainer should cache input training data. :param params: Additional arguments sent to compute engine. @@ -94,16 +94,16 @@ def __init__( label=None, **params): - if 'feature_column' in params: + if 'feature_column_name' in params: raise NameError( - "'feature_column' must be renamed to 'feature'") + "'feature_column_name' must be renamed to 'feature'") if feature: - params['feature_column'] = feature - if 'label_column' in params: + params['feature_column_name'] = feature + if 'label_column_name' in params: raise NameError( - "'label_column' must be renamed to 'label'") + "'label_column_name' must be renamed to 'label'") if label: - params['label_column'] = label + params['label_column_name'] = label BasePredictor.__init__(self, type='classifier', **params) core.__init__( self, diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 1d286a05..b6f8b9e2 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -31,8 +31,8 @@ models_clusterevaluator from .internal.entrypoints.models_datasettransformer import \ models_datasettransformer -from .internal.entrypoints.models_rankerevaluator import \ - models_rankerevaluator +from .internal.entrypoints.models_rankingevaluator import \ + models_rankingevaluator from .internal.entrypoints.models_regressionevaluator import \ models_regressionevaluator from .internal.entrypoints.models_summarizer import models_summarizer @@ -142,8 +142,8 @@ def clone(self): cloned_steps = [deepcopy(s) for s in self.steps] # Rolls back role manipulation during fitting, - # it removes attribute mapped to roles: label_column, - # feature_column, + # it removes attribute mapped to roles: label_column_name, + # feature_column_name, # ... 
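On the user side, roles can still be assigned by column name with the << operator; the pipeline maps them onto the renamed *_column_name attributes internally. A toy sketch (column names and values invented, minimum_example_count_per_leaf lowered so the tiny frame trains):

    import pandas as pd
    from nimbusml import Pipeline, Role
    from nimbusml.ensemble import FastTreesBinaryClassifier
    from nimbusml.preprocessing.schema import ColumnConcatenator

    df = pd.DataFrame(dict(Petal_Length=[1.0, 2.0, 3.0, 4.0, 1.5, 2.5, 3.5, 4.5],
                           Sepal_Length=[0.5, 1.5, 2.5, 3.5, 0.7, 1.7, 2.7, 3.7],
                           Label=[0, 1, 0, 1, 0, 1, 0, 1]))

    pipeline = Pipeline([
        ColumnConcatenator() << {'Features': ['Petal_Length', 'Sepal_Length']},
        FastTreesBinaryClassifier(number_of_trees=2,
                                  minimum_example_count_per_leaf=1) << {
            Role.Label: 'Label', Role.Feature: 'Features'},
    ])
    model = pipeline.fit(df, verbose=0)
    print(model.predict(df))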
if len(cloned_steps) > 0: last_node = self.last_node @@ -612,13 +612,13 @@ def _update_graph_nodes_for_learner( if last_node.type != 'transform': # last node is predictor if hasattr( last_node, - 'feature_column') and last_node.feature_column is \ + 'feature_column_name') and last_node.feature_column_name is \ not None: - if isinstance(last_node.feature_column, list): - learner_features = last_node.feature_column - last_node.feature_column = 'Features' + if isinstance(last_node.feature_column_name, list): + learner_features = last_node.feature_column_name + last_node.feature_column_name = 'Features' else: - learner_features = [last_node.feature_column] + learner_features = [last_node.feature_column_name] elif strategy_iosklearn in ("previous", "accumulate"): if hasattr( last_node, @@ -627,16 +627,16 @@ def _update_graph_nodes_for_learner( learner_features = last_node.feature else: learner_features = [last_node.feature] - last_node.feature_column = 'Features' + last_node.feature_column_name = 'Features' elif isinstance(columns_out, list): learner_features = columns_out - last_node.feature_column = 'Features' + last_node.feature_column_name = 'Features' elif columns_out is None: learner_features = ['Features'] - last_node.feature_column = 'Features' + last_node.feature_column_name = 'Features' else: learner_features = [columns_out] - last_node.feature_column = 'Features' + last_node.feature_column_name = 'Features' else: raise NotImplementedError( "Strategy '{0}' to handle unspecified inputs is not " @@ -644,43 +644,43 @@ def _update_graph_nodes_for_learner( strategy_iosklearn)) if label_column is not None or last_node._use_role(Role.Label): - if getattr(last_node, 'label_column_', None): - label_column = last_node.label_column_ - elif getattr(last_node, 'label_column', None): - label_column = last_node.label_column + if getattr(last_node, 'label_column_name_', None): + label_column = last_node.label_column_name_ + elif getattr(last_node, 'label_column_name', None): + label_column = last_node.label_column_name elif label_column: - last_node.label_column = label_column + last_node.label_column_name = label_column elif y is None: if label_column is None: label_column = Role.Label - last_node.label_column = label_column + last_node.label_column_name = label_column else: label_column = _extract_label_column( last_node, DataSchema.read_schema(y)) if label_column is None: label_column = Role.Label - last_node.label_column = label_column + last_node.label_column_name = label_column else: - last_node.label_column = None + last_node.label_column_name = None label_column = None if weight_column is not None or last_node._use_role( Role.Weight): - if getattr(last_node, 'weight_column', None): - weight_column = last_node.weight_column + if getattr(last_node, 'example_weight_column_name', None): + weight_column = last_node.example_weight_column_name elif weight_column: - last_node.weight_column = weight_column + last_node.example_weight_column_name = weight_column else: - last_node.weight_column = None + last_node.example_weight_column_name = None weight_column = None - if (hasattr(last_node, 'group_id_column_') - and last_node.group_id_column_ is not None): - group_id_column = last_node.group_id_column_ + if (hasattr(last_node, 'row_group_column_name_') + and last_node.row_group_column_name_ is not None): + group_id_column = last_node.row_group_column_name_ elif (hasattr(last_node, - 'group_id_column') and - last_node.group_id_column is not None): - group_id_column = last_node.group_id_column + 
'row_group_column_name') and + last_node.row_group_column_name is not None): + group_id_column = last_node.row_group_column_name else: group_id_column = None @@ -705,12 +705,12 @@ def _update_graph_nodes_for_learner( # node to # use suplied vars learner_node = last_node._get_node( - feature_column=learner_features, + feature_column_name=learner_features, training_data=output_data, predictor_model=predictor_model, - label_column=label_column, - weight_column=weight_column, - group_id_column=group_id_column) + label_column_name=label_column, + example_weight_column_name=weight_column, + row_group_column_name=group_id_column) graph_nodes['learner_node'] = [learner_node] return graph_nodes, learner_node, learner_features else: @@ -924,7 +924,7 @@ def process_input_output(classname, node, input_schema): else: assigned = [] for role in sorted(DataRoles._allowed): - attr = role + 'Column' + attr = DataRoles.to_parameter(role) if attr in inp: assigned.append(inp[attr]) assigned = set(assigned) @@ -932,9 +932,9 @@ def process_input_output(classname, node, input_schema): col for col in input_schema if col not in assigned] for role in sorted(DataRoles._allowed): - attr = role + 'Column' + attr = DataRoles.to_parameter(role) if attr in inp: - if attr == 'FeatureColumn' and inp[attr]\ + if attr == 'FeatureColumnName' and inp[attr]\ not in input_schema: val = not_assigned else: @@ -1295,7 +1295,7 @@ def _process_transformers(self, input_data, input_columns, output_data, node = step._get_node(data=data_in, input=columns_in, output_data=data_out, output=columns_out, model=model_out, - label_column=label_column) + label_column_name=label_column) if isinstance(node, list): # In most cases, _get_node returns only one entrypoint # mapped to the current step. In rare cases, the python @@ -1463,7 +1463,7 @@ def _evaluation(self, evaltype, group_id, **params): column = [OrderedDict(Source=group_id, Name=group_id)] algo_args = dict(data=svd, output_data=svd, column=column) key_node = transforms_texttokeyconverter(**algo_args) - evaluate_node = models_rankerevaluator( + evaluate_node = models_rankingevaluator( group_id_column=group_id, **params) all_nodes.extend([ key_node, @@ -1959,7 +1959,7 @@ def test( raise ValueError( "Pipeline needs a trainer as last step for test()") if y is None: - y = self.last_node.label_column_ + y = self.last_node.label_column_name_ elif y is None: raise ValueError(errmsg) @@ -1975,8 +1975,8 @@ def test( group_id = group_id if group_id is not None else inputs.get( Role.GroupId) if group_id is None: - if hasattr(last_node, 'group_id_column_'): - group_id = last_node.group_id_column_ + if hasattr(last_node, 'row_group_column_name_'): + group_id = last_node.row_group_column_name_ # if model was loaded using load_model, no nodes present except TypeError: pass diff --git a/src/python/nimbusml/preprocessing/filter/skipfilter.py b/src/python/nimbusml/preprocessing/filter/skipfilter.py index 73b9c332..6c7e15fb 100644 --- a/src/python/nimbusml/preprocessing/filter/skipfilter.py +++ b/src/python/nimbusml/preprocessing/filter/skipfilter.py @@ -52,7 +52,7 @@ class SkipFilter(core, BaseTransform, TransformerMixin): @trace def __init__( self, - count=0, + count, columns=None, **params): diff --git a/src/python/nimbusml/preprocessing/filter/takefilter.py b/src/python/nimbusml/preprocessing/filter/takefilter.py index 6fe9722d..9b8d013c 100644 --- a/src/python/nimbusml/preprocessing/filter/takefilter.py +++ b/src/python/nimbusml/preprocessing/filter/takefilter.py @@ -52,7 +52,7 @@ class 
TakeFilter(core, BaseTransform, TransformerMixin): @trace def __init__( self, - count=9223372036854775807, + count, columns=None, **params): diff --git a/src/python/nimbusml/preprocessing/tensorflowscorer.py b/src/python/nimbusml/preprocessing/tensorflowscorer.py index 5aae80b4..c1e0caf2 100644 --- a/src/python/nimbusml/preprocessing/tensorflowscorer.py +++ b/src/python/nimbusml/preprocessing/tensorflowscorer.py @@ -47,8 +47,6 @@ class TensorFlowScorer(core, BaseTransform, TransformerMixin): * The name of each output column should match one of the operations in the Tensorflow graph. - :param label: see `Columns `_. - :param columns: see `Columns `_. :param model_location: TensorFlow model used by the transform. Please see @@ -58,6 +56,8 @@ class TensorFlowScorer(core, BaseTransform, TransformerMixin): :param output_columns: The name of the outputs. + :param label_column: Training labels. + :param tensor_flow_label: TensorFlow label node. :param optimization_operation: The name of the optimization operation in @@ -76,7 +76,12 @@ class TensorFlowScorer(core, BaseTransform, TransformerMixin): :param learning_rate_operation: The name of the operation in the TensorFlow graph which sets optimizer learning rate (Optional). - :param learning_rate: Learning rate to use during optimization. + :param learning_rate: Determines the size of the step taken in the + direction of the gradient in each step of the learning process. This + determines how fast or slow the learner converges on the optimal + solution. If the step size is too big, you might overshoot the optimal + solution. If the step size is too small, training takes longer to + converge to the best solution. :param save_location_operation: Name of the input in TensorFlow graph that specifiy the location for saving/restoring models from disk. @@ -86,6 +91,9 @@ class TensorFlowScorer(core, BaseTransform, TransformerMixin): :param re_train: Retrain TensorFlow model. + :param add_batch_dimension_inputs: Add a batch dimension to the input e.g. + input = [224, 224, 3] => [-1, 224, 224, 3]. + :param params: Additional arguments sent to compute engine. .. 
index:: transform @@ -101,6 +109,7 @@ def __init__( model_location, input_columns=None, output_columns=None, + label_column=None, tensor_flow_label=None, optimization_operation=None, loss_operation=None, @@ -112,15 +121,10 @@ def __init__( save_location_operation='save/Const', save_operation='save/control_dependency', re_train=False, - label=None, + add_batch_dimension_inputs=False, columns=None, **params): - if 'label_column' in params: - raise NameError( - "'label_column' must be renamed to 'label'") - if label: - params['label_column'] = label if columns: params['columns'] = columns if columns: @@ -140,6 +144,7 @@ def __init__( model_location=model_location, input_columns=input_columns, output_columns=output_columns, + label_column=label_column, tensor_flow_label=tensor_flow_label, optimization_operation=optimization_operation, loss_operation=loss_operation, @@ -151,8 +156,8 @@ def __init__( save_location_operation=save_location_operation, save_operation=save_operation, re_train=re_train, + add_batch_dimension_inputs=add_batch_dimension_inputs, **params) - self.label = label self._columns = columns def get_params(self, deep=False): diff --git a/src/python/nimbusml/preprocessing/tokey.py b/src/python/nimbusml/preprocessing/tokey.py index 3113e173..97c00ad3 100644 --- a/src/python/nimbusml/preprocessing/tokey.py +++ b/src/python/nimbusml/preprocessing/tokey.py @@ -48,7 +48,7 @@ class ToKey(core, BaseTransform, TransformerMixin): For more details see `Columns `_. - :param max_num_terms: Maximum number of terms to keep per column when auto- + :param max_num_terms: Maximum number of keys to keep per column when auto- training. :param term: List of terms. @@ -84,7 +84,7 @@ def __init__( self, max_num_terms=1000000, term=None, - sort='Occurrence', + sort='ByOccurrence', text_key_values=False, columns=None, **params): diff --git a/src/python/nimbusml/tests/data_type/test_numeric.py b/src/python/nimbusml/tests/data_type/test_numeric.py index 9406708d..8456985b 100644 --- a/src/python/nimbusml/tests/data_type/test_numeric.py +++ b/src/python/nimbusml/tests/data_type/test_numeric.py @@ -32,7 +32,7 @@ def train_data_type_single( data = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 2, 2]] label = [1, 0, 1, 1] if fit_X_type == "sparse": - model = LightGbmClassifier(min_data_per_leaf=1) + model = LightGbmClassifier(minimum_example_count_per_leaf=1) else: model = LogisticRegressionBinaryClassifier() data_with_new_type = transform_data(data, fit_X_type) @@ -46,7 +46,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None): data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [1.0, 2.0, 3.0], [2.0, 2.0, 2.0]] label = [1, 0, 1, 1] if fit_X_type == "sparse": - model = Pipeline([Binner(), LightGbmClassifier(min_data_per_leaf=1)]) + model = Pipeline([Binner(), LightGbmClassifier(minimum_example_count_per_leaf=1)]) else: model = Pipeline([Binner(), LogisticRegressionBinaryClassifier()]) data_with_new_type = transform_data(data, fit_X_type) diff --git a/src/python/nimbusml/tests/data_type/test_text.py b/src/python/nimbusml/tests/data_type/test_text.py index 802459d0..fbc9b281 100644 --- a/src/python/nimbusml/tests/data_type/test_text.py +++ b/src/python/nimbusml/tests/data_type/test_text.py @@ -50,7 +50,7 @@ def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None): label = [1, 0, 1, 1] model = Pipeline([ NGramFeaturizer(), - LightGbmClassifier(min_data_per_leaf=1, n_thread=1) + LightGbmClassifier(minimum_example_count_per_leaf=1, number_of_threads=1) ]) data_with_new_type = 
transform_data(data, fit_X_type) label_with_new_type = transform_data(label, fit_Y_type) @@ -121,72 +121,72 @@ def test_check_text_datatype_ppl_series_list_array(self): result, scores, metrics = train_data_type_ppl( "series", "list", "array") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_list_series_dataframe(self): result, scores, metrics = train_data_type_ppl( "list", "series", "dataframe") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_list_list_series(self): result, scores, metrics = train_data_type_ppl("list", "list", "series") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_array_series_array(self): result, scores, metrics = train_data_type_ppl( "array", "series", "array") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_series_array_dataframe(self): result, scores, metrics = train_data_type_ppl( "series", "array", "dataframe") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_array_series_list(self): result, scores, metrics = train_data_type_ppl( "array", "series", "list") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_dataframe_list_series(self): result, scores, metrics = train_data_type_ppl( "dataframe", "list", "series") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + 
assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_series_series_dataframe(self): result, scores, metrics = train_data_type_ppl( "series", "series", "dataframe") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) def test_check_text_datatype_ppl_dataframe_series_series(self): result, scores, metrics = train_data_type_ppl( "dataframe", "series", "series") assert len(result) == 4 - assert_almost_equal(metrics['Log-loss'].item(), 0.4402459) + assert_almost_equal(metrics['Log-loss'].item(), 0.56233514) assert_array_equal(scores['Score.0'].values, result['Score.0'].values) - assert_array_almost_equal(scores['Score.0'].values, [0.359195, 0.528997, 0.214895, 0.354186]) + assert_array_almost_equal(scores['Score.0'].values, [0.25, 0.25, 0.25, 0.25]) if __name__ == '__main__': diff --git a/src/python/nimbusml/tests/ensemble/test_fasttreesbinaryclassifier.py b/src/python/nimbusml/tests/ensemble/test_fasttreesbinaryclassifier.py index 98c4927a..f315a97c 100644 --- a/src/python/nimbusml/tests/ensemble/test_fasttreesbinaryclassifier.py +++ b/src/python/nimbusml/tests/ensemble/test_fasttreesbinaryclassifier.py @@ -27,7 +27,7 @@ def test_default_label(self): "Petal_Length", "Sepal_Length"]}, FastTreesBinaryClassifier( - num_trees=2) << { + number_of_trees=2) << { Role.Label: 'Label', Role.Feature: 'Features'}]) @@ -38,7 +38,7 @@ def test_default_label(self): pipeline = Pipeline([ ColumnConcatenator() << { 'Features': ["Petal_Length", "Sepal_Length"]}, - FastTreesBinaryClassifier(num_trees=2) << { + FastTreesBinaryClassifier(number_of_trees=2) << { Role.Feature: 'Features'} ]) @@ -50,7 +50,7 @@ def test_default_label(self): pipeline = Pipeline([ ColumnConcatenator() << { 'Features': ["Petal_Length", "Sepal_Length"]}, - FastTreesBinaryClassifier(num_trees=2) + FastTreesBinaryClassifier(number_of_trees=2) ]) model = pipeline.fit(df, verbose=0) @@ -61,7 +61,7 @@ def test_default_label(self): pipeline = Pipeline([ ColumnConcatenator() << { 'Features': ["Petal_Length", "Sepal_Length"]}, - FastTreesBinaryClassifier(num_trees=2) << {Role.Label: 'Label'} + FastTreesBinaryClassifier(number_of_trees=2) << {Role.Label: 'Label'} ]) model = pipeline.fit(df, verbose=0) diff --git a/src/python/nimbusml/tests/ensemble/test_lightgbmranker.py b/src/python/nimbusml/tests/ensemble/test_lightgbmranker.py index c14fa26e..483522d4 100644 --- a/src/python/nimbusml/tests/ensemble/test_lightgbmranker.py +++ b/src/python/nimbusml/tests/ensemble/test_lightgbmranker.py @@ -45,22 +45,22 @@ def test_lightgbmranker_asfilestream(self): metrics, _ = pipeline.test(eval_stream) assert_almost_equal( metrics['NDCG@1'][0], - 43.571429, - decimal=5, + 0.43571429, + decimal=7, err_msg="NDCG@1 should be %s" % - 43.571429) + 0.43571429) assert_almost_equal( metrics['NDCG@2'][0], - 51.28226, - decimal=5, + 0.5128226, + decimal=7, err_msg="NDCG@2 should be %s" % - 51.28226) + 0.5128226) assert_almost_equal( metrics['NDCG@3'][0], - 
55.168069, - decimal=5, + 0.55168069, + decimal=7, err_msg="NDCG@3 should be %s" % - 55.168069) + 0.55168069) assert_almost_equal( metrics['DCG@1'][0], 4.688759, @@ -97,22 +97,22 @@ def test_lightgbmranker_asdataframe(self): metrics, _ = e.test(df) assert_almost_equal( metrics['NDCG@1'][0], - 43.571429, - decimal=5, + 0.43571429, + decimal=7, err_msg="NDCG@1 should be %s" % - 43.571429) + 0.43571429) assert_almost_equal( metrics['NDCG@2'][0], - 51.28226, - decimal=5, + 0.5128226, + decimal=7, err_msg="NDCG@2 should be %s" % - 51.28226) + 0.5128226) assert_almost_equal( metrics['NDCG@3'][0], - 55.168069, - decimal=5, + 0.55168069, + decimal=7, err_msg="NDCG@3 should be %s" % - 55.168069) + 0.55168069) assert_almost_equal( metrics['DCG@1'][0], 4.688759, @@ -149,22 +149,22 @@ def test_lightgbmranker_asdataframe_groupid(self): metrics, _ = e.test(df) assert_almost_equal( metrics['NDCG@1'][0], - 43.571429, - decimal=5, + 0.43571429, + decimal=7, err_msg="NDCG@1 should be %s" % - 43.571429) + 0.43571429) assert_almost_equal( metrics['NDCG@2'][0], - 51.28226, - decimal=5, + 0.5128226, + decimal=7, err_msg="NDCG@2 should be %s" % - 51.28226) + 0.5128226) assert_almost_equal( metrics['NDCG@3'][0], - 55.168069, - decimal=5, + 0.55168069, + decimal=7, err_msg="NDCG@3 should be %s" % - 55.168069) + 0.55168069) assert_almost_equal( metrics['DCG@1'][0], 4.688759, @@ -212,22 +212,22 @@ def test_lightgbmranker_asfilestream_evaltyperanking(self): metrics, _ = pipeline.test(eval_stream) assert_almost_equal( metrics['NDCG@1'][0], - 43.571429, - decimal=5, + 0.43571429, + decimal=7, err_msg="NDCG@1 should be %s" % - 43.571429) + 0.43571429) assert_almost_equal( metrics['NDCG@2'][0], - 51.28226, - decimal=5, + 0.5128226, + decimal=7, err_msg="NDCG@2 should be %s" % - 51.28226) + 0.5128226) assert_almost_equal( metrics['NDCG@3'][0], - 55.168069, - decimal=5, + 0.55168069, + decimal=7, err_msg="NDCG@3 should be %s" % - 55.168069) + 0.55168069) assert_almost_equal( metrics['DCG@1'][0], 4.688759, diff --git a/src/python/nimbusml/tests/feature_extraction/categorical/test_onehothashvectorizer.py b/src/python/nimbusml/tests/feature_extraction/categorical/test_onehothashvectorizer.py index 2e2d90ce..61b424a6 100644 --- a/src/python/nimbusml/tests/feature_extraction/categorical/test_onehothashvectorizer.py +++ b/src/python/nimbusml/tests/feature_extraction/categorical/test_onehothashvectorizer.py @@ -23,7 +23,7 @@ def test_numeric_columns(self): 'edu': 'education', 'in': 'induced', 'sp': 'spontaneous'}, - hash_bits=2) + number_of_bits=2) xf.fit_transform(data) xf = OneHotHashVectorizer( @@ -31,7 +31,7 @@ def test_numeric_columns(self): 'education', 'induced', 'spontaneous'], - hash_bits=2) + number_of_bits=2) xf.fit_transform(data) diff --git a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py index 4e66a667..31d46f9a 100644 --- a/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py +++ b/src/python/nimbusml/tests/feature_extraction/text/test_wordembedding.py @@ -84,14 +84,15 @@ def test_word_embedding_example(self): # TODO: Bug 149666 # TODO: Bug 149700 pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True, + NGramFeaturizer(word_feature_extractor=Ngram(), + output_tokens_column_name='features_TransformedText', columns={'features': ['id', 'education']}), WordEmbedding(columns='features_TransformedText') ]) features = pipeline.fit_transform(data) - assert features.shape 
== (248, 802) + assert features.shape == (248, 787) # TODO: fix ssl issue on test centos7 & ubuntu14 boxes. # Test works on ubuntu16. @@ -120,14 +121,15 @@ def test_word_embedding_example2(self): data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True, + NGramFeaturizer(word_feature_extractor=Ngram(), + output_tokens_column_name='features_TransformedText', columns={'features': ['id', 'education']}), WordEmbedding(columns='features_TransformedText') ]) features = pipeline.fit_transform(data) - assert features.shape == (248, 802) + assert features.shape == (248, 787) assert 'features_TransformedText.94' in list(features.columns) # TODO: fix ssl issue on test centos7 & ubuntu14 boxes. @@ -156,7 +158,7 @@ def test_word_embedding_example_dict_same_name(self): 'col=spontaneous:R4:6 header=+' data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True, + NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens_column_name='features_TransformedText', columns={'features': ['id', 'education']}), # What is features_TransformedText? @@ -166,7 +168,7 @@ def test_word_embedding_example_dict_same_name(self): ]) features = pipeline.fit_transform(data) - assert features.shape == (248, 802) + assert features.shape == (248, 787) @unittest.skip('System.ArgumentOutOfRangeException') def test_word_embedding_example_dict_newname(self): @@ -176,7 +178,8 @@ def test_word_embedding_example_dict_newname(self): 'col=spontaneous:R4:6 header=+' data = FileDataStream(path, schema=file_schema) pipeline = Pipeline([ - NGramFeaturizer(word_feature_extractor=Ngram(), output_tokens=True, + NGramFeaturizer(word_feature_extractor=Ngram(), + output_tokens_column_name='features_TransformedText', columns={'features': ['id', 'education']}), # What is features_TransformedText? 
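With output_tokens gone, the tokens column is named explicitly and fed to WordEmbedding; a sketch on made-up sentences (WordEmbedding fetches a pretrained model on first use, so this needs network access):

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
    from nimbusml.feature_extraction.text.extractor import Ngram

    data = pd.DataFrame(dict(review=['I like this movie',
                                     "I don't like this",
                                     'It is nice',
                                     'So boring']))

    pipeline = Pipeline([
        # output_tokens=True is replaced by an explicit tokens column name.
        NGramFeaturizer(word_feature_extractor=Ngram(),
                        output_tokens_column_name='review_TransformedText',
                        columns='review'),
        WordEmbedding(columns='review_TransformedText'),
    ])
    features = pipeline.fit_transform(data)
    print(features.shape)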
diff --git a/src/python/nimbusml/tests/feature_selection/test_mutualinformationselector.py b/src/python/nimbusml/tests/feature_selection/test_mutualinformationselector.py index 5d5586ec..db907fd7 100644 --- a/src/python/nimbusml/tests/feature_selection/test_mutualinformationselector.py +++ b/src/python/nimbusml/tests/feature_selection/test_mutualinformationselector.py @@ -69,7 +69,7 @@ def test_example_success(self): Role.Feature: [ 'x1', 'x2'], Role.Label: 'like'} assert transform_2._allowed_roles == {'Label'} - assert transform_2.label_column == 'like' + assert transform_2.label_column_name == 'like' assert transform_2.input == ['x1', 'x2'] assert transform_2.output == ['Feature'] exp = Pipeline([transform_2]) @@ -79,7 +79,7 @@ def test_example_success(self): transform_2 = MutualInformationSelector( ) << {"zoo": ['x1', 'x2'], Role.Label: 'like'} assert transform_2._allowed_roles == {'Label'} - assert transform_2.label_column == 'like' + assert transform_2.label_column_name == 'like' assert transform_2.input == ['x1', 'x2'] assert transform_2.output == ['zoo'] exp = Pipeline([transform_2]) @@ -89,7 +89,7 @@ def test_example_success(self): transform_2 = MutualInformationSelector() << { "zoo": ['x1'], Role.Label: 'like'} assert transform_2._allowed_roles == {'Label'} - assert transform_2.label_column == 'like' + assert transform_2.label_column_name == 'like' assert transform_2.input == ['x1'] assert transform_2.output == ['zoo'] exp = Pipeline([transform_2]) @@ -99,7 +99,7 @@ def test_example_success(self): transform_2 = MutualInformationSelector( slots_in_output=1, columns=['x1'], label='like') assert transform_2._allowed_roles == {'Label'} - assert transform_2.label_column == 'like' + assert transform_2.label_column_name == 'like' assert transform_2.input == ['x1'] assert transform_2.output == ['x1'] pipe = Pipeline([transform_2]) @@ -152,7 +152,7 @@ def test_example_fails(self): slots_in_output=1, feature=[ 'x1', 'x2'], label='like') assert transform_2._allowed_roles == {'Label'} - assert transform_2.label_column == 'like' + assert transform_2.label_column_name == 'like' # assert transform_2.input == ['x1', 'x2'] # None # assert transform_2.output == ['Feature'] # None pipe = Pipeline([transform_2]) diff --git a/src/python/nimbusml/tests/idv/test_idv.py b/src/python/nimbusml/tests/idv/test_idv.py index 39ca538b..e86f2226 100644 --- a/src/python/nimbusml/tests/idv/test_idv.py +++ b/src/python/nimbusml/tests/idv/test_idv.py @@ -20,6 +20,21 @@ sep=',', numeric_dtype=np.float32) # Error with integer input +def is_nan(x): + return (x is np.nan or x != x) + +def assert_2d_array_equal(actual, desired): + if len(actual) != len(desired): + assert_true(False, "arrays are of different lengths.") + + for i in range(len(actual)): + if len(actual[i]) != len(desired[i]): + assert_true(False, "arrays are of different lengths.") + for y in range(len(actual[i])): + if is_nan(actual[i][y]) and is_nan(desired[i][y]): + continue + assert_true(actual[i][y] == desired[i][y]) + def transform_data(): xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'}) @@ -40,7 +55,7 @@ def test_fit_transform(self): assert_array_equal( transformed_data_as_df.columns, transformed_data_df.columns) - assert_array_equal( + assert_2d_array_equal( transformed_data_as_df.values, transformed_data_df.values) diff --git a/src/python/nimbusml/tests/linear_model/test_symsgdbinaryclassifier.py b/src/python/nimbusml/tests/linear_model/test_symsgdbinaryclassifier.py index 2d96a517..fcf0561d 100644 --- 
a/src/python/nimbusml/tests/linear_model/test_symsgdbinaryclassifier.py +++ b/src/python/nimbusml/tests/linear_model/test_symsgdbinaryclassifier.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------------------------- import unittest +import os import numpy as np from nimbusml.datasets import get_dataset @@ -15,7 +16,7 @@ class TestSymSgdBinaryClassifier(unittest.TestCase): - @unittest.skip("BUG: Not included in ML.NET yet") + @unittest.skipIf(os.name != "nt", "BUG: SymSgd lib fails to load on Linux") def test_SymSgdBinaryClassifier(self): np.random.seed(0) df = get_dataset("infert").as_df() diff --git a/src/python/nimbusml/tests/metrics/test_metrics.py b/src/python/nimbusml/tests/metrics/test_metrics.py index 274e1ebb..9dc02f68 100644 --- a/src/python/nimbusml/tests/metrics/test_metrics.py +++ b/src/python/nimbusml/tests/metrics/test_metrics.py @@ -80,10 +80,10 @@ def test_metrics_evaluate_binary(self): 0.686) assert_almost_equal( metrics['Log-loss reduction'][0], - 30.05, - decimal=1, + 0.3005, + decimal=3, err_msg="Log-loss reduction should be %s" % - 30.05) + 0.3005) assert_almost_equal( metrics['Test-set entropy (prior Log-Loss/instance)'][0], 0.981, @@ -136,10 +136,10 @@ def test_metrics_evaluate_multiclass(self): 0.419) assert_almost_equal( metrics['Log-loss reduction'][0], - 38.476, - decimal=1, + 0.38476, + decimal=3, err_msg="Log-loss reduction should be %s" % - 38.476) + 0.38476) assert_almost_equal( metrics['(class 0)'][0], 0.223, @@ -193,7 +193,7 @@ def test_metrics_evaluate_clusterer(self): X_train, X_test, y_train, y_test = \ train_test_split(df.loc[:, df.columns != 'Label'], df['Label']) - lr = KMeansPlusPlus(n_clusters=2, init_algorithm="Random") + lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random") e = Pipeline([lr]) e.fit(X_train, y_train.to_frame(), verbose=0) metrics, _ = e.test(X_test, y_test) @@ -229,9 +229,9 @@ def test_metrics_evaluate_anomalydetection(self): svm = OneClassSvmAnomalyDetector() # noqa e = Pipeline([svm]) e.fit(X_train, verbose=0) - if e.nodes[-1].label_column_ is not None: + if e.nodes[-1].label_column_name_ is not None: raise ValueError("'{0}' should be None".format( - e.nodes[-1].label_column_)) + e.nodes[-1].label_column_name_)) assert y_test.name == 'Setosa' metrics, _ = e.test(X_test, y_test) assert_almost_equal( @@ -306,22 +306,22 @@ def test_metrics_evaluate_ranking_group_id_from_new_dataframe(self): X_test, y_test, evaltype='ranking', group_id=groups_df) assert_almost_equal( metrics['NDCG@1'][0], - 100, + 1, decimal=5, err_msg="NDCG@1 should be %s" % - 100) + 1) assert_almost_equal( metrics['NDCG@2'][0], - 100, + 1, decimal=5, - err_msg="NDCG@1 should be %s" % - 100) + err_msg="NDCG@2 should be %s" % + 1) assert_almost_equal( metrics['NDCG@3'][0], - 100, + 1, decimal=5, - err_msg="NDCG@1 should be %s" % - 100) + err_msg="NDCG@3 should be %s" % + 1) # TODO: JRP comment for now. 
Debug fluctuations on build server # assert_almost_equal(metrics['DCG@1'][0], 4.32808, decimal=3, # err_msg="DCG@1 should be %s" % 4.32808) @@ -359,22 +359,22 @@ def test_metrics_evaluate_ranking_group_id_from_existing_column_in_X(self): X_test, y_test, evaltype='ranking', group_id='group_id') assert_almost_equal( metrics['NDCG@1'][0], - 100, + 1, decimal=5, err_msg="NDCG@1 should be %s" % - 100) + 1) assert_almost_equal( metrics['NDCG@2'][0], - 100, + 1, decimal=5, - err_msg="NDCG@1 should be %s" % - 100) + err_msg="NDCG@2 should be %s" % + 1) assert_almost_equal( metrics['NDCG@3'][0], - 100, + 1, decimal=5, - err_msg="NDCG@1 should be %s" % - 100) + err_msg="NDCG@3 should be %s" % + 1) assert_almost_equal( metrics['DCG@1'][0], 4.32808, @@ -400,7 +400,7 @@ def test_metrics_evaluate_binary_from_filedatastream(self): e = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), LightGbmRegressor(feature=['induced', 'edu'], label='age', - n_thread=1) + number_of_threads=1) ]) e.fit(data, verbose=0) metrics, _ = e.test(data) diff --git a/src/python/nimbusml/tests/model_selection/test_cv.py b/src/python/nimbusml/tests/model_selection/test_cv.py index 6006ba94..2f264de2 100644 --- a/src/python/nimbusml/tests/model_selection/test_cv.py +++ b/src/python/nimbusml/tests/model_selection/test_cv.py @@ -404,7 +404,7 @@ def check_cv_with_defaults2( steps = [ToKey() << { group_id: group_id}, ColumnConcatenator() << { 'Features': [features]}, LightGbmRanker( - min_data_per_leaf=1) << { + minimum_example_count_per_leaf=1) << { Role.GroupId: group_id}] data = self.data_wt_rename(label_name, group_id, features) check_cv(pipeline=Pipeline(steps), X=data, **params) @@ -420,7 +420,7 @@ def check_cv_with_defaults_df( ToKey() << { group_id: group_id}, LightGbmRanker( - min_data_per_leaf=1, + minimum_example_count_per_leaf=1, feature=features, label='rank', group_id='group' )] @@ -474,7 +474,7 @@ def check_cv_with_defaults( group_id: group_id}, # even specify all the roles needed in the following line, the # roles are still not passed correctly - LightGbmRanker(min_data_per_leaf=1) << { + LightGbmRanker(minimum_example_count_per_leaf=1) << { Role.GroupId: group_id, Role.Feature: features, Role.Label: label_name}] data = self.data(label_name, group_id, features) diff --git a/src/python/nimbusml/tests/model_selection/test_sweep.py b/src/python/nimbusml/tests/model_selection/test_sweep.py index 5d4be8d9..5a5f0b32 100644 --- a/src/python/nimbusml/tests/model_selection/test_sweep.py +++ b/src/python/nimbusml/tests/model_selection/test_sweep.py @@ -42,30 +42,30 @@ class TestSweep(unittest.TestCase): def test_hyperparameters_sweep(self): # general test with combination of named and unnamed steps np.random.seed(0) - df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'], - workclass=['X', 'X', 'Y', 'Y', 'Y'], - y=[1, 0, 1, 0, 0])) + df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], + workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], + y=[1, 0, 1, 1, 0, 1, 0])) X = df.drop('y', axis=1) y = df['y'] pipe = Pipeline([ ('cat', OneHotVectorizer() << 'education'), # unnamed step, stays same in grid search OneHotHashVectorizer() << 'workclass', - # num_trees 0 will actually be never run by grid search - ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2)) + # number_of_trees 0 will actually be never run by grid search + ('learner', FastTreesBinaryClassifier(number_of_trees=0, number_of_leaves=2)) ]) param_grid = dict( cat__output_kind=[ - 'Ind', 'Bin'], learner__num_trees=[ + 'Indicator', 
'Binary'], learner__number_of_trees=[ 1, 2, 3]) grid = GridSearchCV(pipe, param_grid) grid.fit(X, y) print(grid.best_params_) assert grid.best_params_ == { - 'cat__output_kind': 'Ind', - 'learner__num_trees': 1} + 'cat__output_kind': 'Indicator', + 'learner__number_of_trees': 1} def test_learners_sweep(self): # grid search over 2 learners, even though pipe defined with @@ -74,9 +74,9 @@ def test_learners_sweep(self): # over it np.random.seed(0) - df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'], - workclass=['X', 'X', 'Y', 'Y', 'Y'], - y=[1, 0, 1, 0, 0])) + df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], + workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], + y=[1, 0, 1, 1, 0, 1, 0])) X = df.drop('y', axis=1) y = df['y'] @@ -88,7 +88,7 @@ def test_learners_sweep(self): learner=[ FastLinearBinaryClassifier(), FastTreesBinaryClassifier()], - learner__train_threads=[ + learner__number_of_threads=[ 1, 4]) grid = GridSearchCV(pipe, param_grid) @@ -96,13 +96,13 @@ def test_learners_sweep(self): grid.fit(X, y) assert grid.best_params_[ 'learner'].__class__.__name__ == 'FastLinearBinaryClassifier' - assert grid.best_params_['learner__train_threads'] == 1 + assert grid.best_params_['learner__number_of_threads'] == 1 @unittest.skipIf( six.PY2, "potential bug in pandas read_csv of unicode text in python2.7") def test_uciadult_sweep(self): - # grid search over num_trees and then confirm the best num_trees by + # grid search over number_of_trees and then confirm the best number_of_trees by # full train np.random.seed(0) (X_train, y_train) = get_X_y(train_file, @@ -111,27 +111,27 @@ def test_uciadult_sweep(self): label_column, sep=',', encoding='utf-8') cat = OneHotHashVectorizer() << categorical_columns - # num_trees 100 will actually be never run by grid search + # number_of_trees 100 will actually be never run by grid search # as its not in param_grid below - learner = FastTreesBinaryClassifier(num_trees=100, num_leaves=5) + learner = FastTreesBinaryClassifier(number_of_trees=100, number_of_leaves=5) pipe = Pipeline(steps=[('cat', cat), ('learner', learner)]) - param_grid = dict(learner__num_trees=[1, 5, 10]) + param_grid = dict(learner__number_of_trees=[1, 5, 10]) grid = GridSearchCV(pipe, param_grid) grid.fit(X_train, y_train) - assert grid.best_params_['learner__num_trees'] == 10 + assert grid.best_params_['learner__number_of_trees'] == 10 - # compare AUC on num_trees 1, 5, 10 - pipe.set_params(learner__num_trees=1) + # compare AUC on number_of_trees 1, 5, 10 + pipe.set_params(learner__number_of_trees=1) pipe.fit(X_train, y_train) metrics1, _ = pipe.test(X_train, y_train) - pipe.set_params(learner__num_trees=5) + pipe.set_params(learner__number_of_trees=5) pipe.fit(X_train, y_train) metrics5, _ = pipe.test(X_train, y_train) - pipe.set_params(learner__num_trees=10) + pipe.set_params(learner__number_of_trees=10) pipe.fit(X_train, y_train) metrics10, _ = pipe.test(X_train, y_train) @@ -147,17 +147,23 @@ def test_uciadult_sweep(self): platform.linux_distribution()[1] != "16.04"), "not supported on this platform") def test_NGramFeaturizer_sweep(self): - # grid search over num_trees and then confirm the best num_trees by + # grid search over number_of_trees and then confirm the best number_of_trees by # full train np.random.seed(0) data = pd.DataFrame( { 'review': [ + 'I like this movie', + 'I don\'t like this', + 'It is nice', 'I like this movie', 'I don\'t like this', 'It is nice', 'So boring'], 'sentiment': [ + 'pos', + 'neg', + 'pos', 'pos', 'neg', 'pos', @@ -167,24 
+173,24 @@ def test_NGramFeaturizer_sweep(self): ('ng', NGramFeaturizer( word_feature_extractor=Ngram(), - output_tokens=True, + output_tokens_column_name='review_TransformedText', columns='review')), WordEmbedding( columns='review_TransformedText', - model_kind='Sswe'), + model_kind='SentimentSpecificWordEmbedding'), ('lr', FastLinearBinaryClassifier( feature=[ 'review', 'review_TransformedText'], - train_threads=1, + number_of_threads=1, shuffle=False))]) - param_grid = dict(lr__max_iterations=[1, 20]) + param_grid = dict(lr__maximum_number_of_iterations=[1, 20]) grid = GridSearchCV(pipeline, param_grid) grid.fit(data['review'], 1 * (data['sentiment'] == 'pos')) - assert grid.best_params_['lr__max_iterations'] == 1 + assert grid.best_params_['lr__maximum_number_of_iterations'] == 20 # Problem with the SSL CA cert (path? access rights?) for the build # machines to download resources for wordembedding transform @@ -194,17 +200,23 @@ def test_NGramFeaturizer_sweep(self): platform.linux_distribution()[1] != "16.04"), "not supported on this platform") def test_NGramFeaturizer_glove(self): - # grid search over num_trees and then confirm the best num_trees by + # grid search over number_of_trees and then confirm the best number_of_trees by # full train np.random.seed(0) data = pd.DataFrame( { 'review': [ + 'I like this movie', + 'I don\'t like this', + 'It is nice', 'I like this movie', 'I don\'t like this', 'It is nice', 'So boring'], 'sentiment': [ + 'pos', + 'neg', + 'pos', 'pos', 'neg', 'pos', @@ -214,7 +226,7 @@ def test_NGramFeaturizer_glove(self): ('ng', NGramFeaturizer( word_feature_extractor=Ngram(), - output_tokens=True, + output_tokens_column_name='review_TransformedText', columns='review')), WordEmbedding( columns='review_TransformedText', @@ -224,14 +236,14 @@ def test_NGramFeaturizer_glove(self): feature=[ 'review', 'review_TransformedText'], - train_threads=1, + number_of_threads=1, shuffle=False))]) - param_grid = dict(lr__max_iterations=[1, 100, 20]) + param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20]) grid = GridSearchCV(pipeline, param_grid) grid.fit(data['review'], 1 * (data['sentiment'] == 'pos')) - assert grid.best_params_['lr__max_iterations'] == 1 + assert grid.best_params_['lr__maximum_number_of_iterations'] == 100 def test_clone_sweep(self): # grid search, then clone pipeline and grid search again @@ -243,10 +255,10 @@ def test_clone_sweep(self): label_column, sep=',', encoding='utf-8') cat = OneHotHashVectorizer() << categorical_columns - learner = FastTreesBinaryClassifier(num_trees=100, num_leaves=5) + learner = FastTreesBinaryClassifier(number_of_trees=100, number_of_leaves=5) pipe = Pipeline(steps=[('cat', cat), ('learner', learner)]) - param_grid = dict(learner__num_trees=[1, 5, 10]) + param_grid = dict(learner__number_of_trees=[1, 5, 10]) grid = GridSearchCV(pipe, param_grid) grid.fit(X_train, y_train) @@ -255,8 +267,8 @@ def test_clone_sweep(self): grid1.fit(X_train, y_train) assert grid.best_params_[ - 'learner__num_trees'] == grid1.best_params_[ - 'learner__num_trees'] + 'learner__number_of_trees'] == grid1.best_params_[ + 'learner__number_of_trees'] def test_error_conditions(self): # grid search on a wrong param @@ -267,7 +279,7 @@ def test_error_conditions(self): label_column, sep=',', encoding='utf-8') cat = OneHotHashVectorizer() << categorical_columns - learner = FastTreesBinaryClassifier(num_trees=100, num_leaves=5) + learner = FastTreesBinaryClassifier(number_of_trees=100, number_of_leaves=5) pipe = Pipeline(steps=[('cat', cat), 
('learner', learner)]) param_grid = dict(learner__wrong_arg=[1, 5, 10]) diff --git a/src/python/nimbusml/tests/multiclass/test_onevsrestclassifier.py b/src/python/nimbusml/tests/multiclass/test_onevsrestclassifier.py index d2ce6ece..5cb6b386 100644 --- a/src/python/nimbusml/tests/multiclass/test_onevsrestclassifier.py +++ b/src/python/nimbusml/tests/multiclass/test_onevsrestclassifier.py @@ -81,10 +81,10 @@ def test_predict_proba_produces_distribution_sum_to_1(self): # TODO: BUG 231482 , why doesnt FM work # FactorizationMachineBinaryClassifier(), LogisticRegressionBinaryClassifier(), - FastForestBinaryClassifier(min_split=1), + FastForestBinaryClassifier(minimum_example_count_per_leaf=1), GamBinaryClassifier(), AveragedPerceptronBinaryClassifier(), - FastTreesBinaryClassifier(min_split=1), + FastTreesBinaryClassifier(minimum_example_count_per_leaf=1), LightGbmBinaryClassifier(), FastLinearBinaryClassifier(), SgdBinaryClassifier(), @@ -107,10 +107,10 @@ def test_failing_predict_proba_called_with_use_probabilites_false(self): # TODO: BUG 231482 , why doesnt FM work # FactorizationMachineBinaryClassifier(), LogisticRegressionBinaryClassifier(), - FastForestBinaryClassifier(min_split=1), + FastForestBinaryClassifier(minimum_example_count_per_leaf=1), GamBinaryClassifier(), AveragedPerceptronBinaryClassifier(), - FastTreesBinaryClassifier(min_split=1), + FastTreesBinaryClassifier(minimum_example_count_per_leaf=1), LightGbmBinaryClassifier(), FastLinearBinaryClassifier(), SgdBinaryClassifier(), @@ -127,10 +127,10 @@ def test_decision_function_produces_distribution_not_sum_to_1(self): # TODO: BUG 231482 , why doesnt FM work # FactorizationMachineBinaryClassifier(), LogisticRegressionBinaryClassifier(), - FastForestBinaryClassifier(min_split=1), + FastForestBinaryClassifier(minimum_example_count_per_leaf=1), GamBinaryClassifier(), AveragedPerceptronBinaryClassifier(), - FastTreesBinaryClassifier(min_split=1), + FastTreesBinaryClassifier(minimum_example_count_per_leaf=1), LightGbmBinaryClassifier(), FastLinearBinaryClassifier(), SgdBinaryClassifier(), @@ -151,10 +151,10 @@ def test_failing_decision_function_called_with_use_probabilites_true(self): # TODO: BUG 231482 , why doesnt FM work # FactorizationMachineBinaryClassifier(), LogisticRegressionBinaryClassifier(), - FastForestBinaryClassifier(min_split=1), + FastForestBinaryClassifier(minimum_example_count_per_leaf=1), GamBinaryClassifier(), AveragedPerceptronBinaryClassifier(), - FastTreesBinaryClassifier(min_split=1), + FastTreesBinaryClassifier(minimum_example_count_per_leaf=1), LightGbmBinaryClassifier(), FastLinearBinaryClassifier(), SgdBinaryClassifier(), @@ -170,13 +170,13 @@ def test_ovr_accuracy(self): clfs = [ # TODO: BUG 231482 , why doesnt FM work # FactorizationMachineBinaryClassifier(), - LogisticRegressionBinaryClassifier(train_threads=1), - FastForestBinaryClassifier(min_split=1, train_threads=1), - GamBinaryClassifier(train_threads=1), + LogisticRegressionBinaryClassifier(number_of_threads=1), + FastForestBinaryClassifier(minimum_example_count_per_leaf=1, number_of_threads=1), + GamBinaryClassifier(number_of_threads=1), AveragedPerceptronBinaryClassifier(), - FastTreesBinaryClassifier(min_split=1, train_threads=1), - FastLinearBinaryClassifier(train_threads=1), - SgdBinaryClassifier(train_threads=1), + FastTreesBinaryClassifier(minimum_example_count_per_leaf=1, number_of_threads=1), + FastLinearBinaryClassifier(number_of_threads=1), + SgdBinaryClassifier(number_of_threads=1), # SymSgdBinaryClassifier(number_of_threads=1), ] diff 
--git a/src/python/nimbusml/tests/pipeline/test_clone.py b/src/python/nimbusml/tests/pipeline/test_clone.py index 6ffbc0de..3049f2c3 100644 --- a/src/python/nimbusml/tests/pipeline/test_clone.py +++ b/src/python/nimbusml/tests/pipeline/test_clone.py @@ -177,8 +177,8 @@ def test_nofit_pipeline_clone(self): LightGbmRanker(feature=features, label='label_1', group_id='group_2', - num_boost_round=1, - num_leaves=4) + number_of_iterations=1, + number_of_leaves=4) ]) clone_and_check(pipe) @@ -187,14 +187,14 @@ def test_pipeline_clone_dataframe_roles_arguments(self): LightGbmRanker(feature=features, label='label_1', group_id='group_2', - num_boost_round=1, - num_leaves=4) + number_of_iterations=1, + number_of_leaves=4) ]) fit_test_clone_and_check(pipe, df) def test_pipeline_clone_dataframe_roles_shift_operator(self): pipe = Pipeline([ - LightGbmRanker(num_boost_round=1, num_leaves=4) << { + LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << { Role.Feature: features, Role.Label: 'label_1', Role.GroupId: 'group_2'} @@ -207,15 +207,15 @@ def test_pipeline_clone_filedatastream_roles_arguments(self): LightGbmRanker(feature=features, label='label_1', group_id='group_2', - num_boost_round=1, - num_leaves=4) + number_of_iterations=1, + number_of_leaves=4) ]) fit_test_clone_and_check(pipe, fds) def test_pipeline_clone_filedatastream_roles_shift_operator(self): pipe = Pipeline([ ToKey() << {'group_2': 'group_2'}, - LightGbmRanker(num_boost_round=1, num_leaves=4) << { + LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << { Role.Feature: features, Role.Label: 'label_1', Role.GroupId: 'group_2'} diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index f163e78c..309650b5 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -39,7 +39,7 @@ def test_model_dataframe(self): ('linear', FastLinearBinaryClassifier( shuffle=False, - train_threads=1))]) + number_of_threads=1))]) model_nimbusml.fit(train, label) @@ -80,7 +80,7 @@ def test_model_datastream(self): ('linear', FastLinearBinaryClassifier( shuffle=False, - train_threads=1))]) + number_of_threads=1))]) model_nimbusml.fit(train, label) diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_syntax.py b/src/python/nimbusml/tests/pipeline/test_pipeline_syntax.py index 4f0914b8..b4a842cb 100644 --- a/src/python/nimbusml/tests/pipeline/test_pipeline_syntax.py +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_syntax.py @@ -48,9 +48,9 @@ def test_pipeline_name_error(self): "'minsplit'] are not allowed" with self.assertRaises(NameError, msg=msg): LightGbmClassifier(min_data=1, min_data_in_bin=1, - min_data_per_leaf=1, + minimum_example_count_per_leaf=1, minsplit=1, NumLeaves=2) - + @unittest.skip def test_pipeline_with_no_columns_raise(self): trainData = pd.DataFrame( { @@ -111,7 +111,7 @@ def test_pipeline_with_no_columns(self): ppl = Pipeline([ NGramFeaturizer(word_feature_extractor=n_gram()), - LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1) + LightGbmClassifier(minimum_example_count_per_leaf=1, minimum_example_count_per_group=1) ]) assert ppl is not None @@ -124,7 +124,7 @@ def test_pipeline_with_no_columns(self): ppl = Pipeline([ NGramFeaturizer(word_feature_extractor=n_gram()), - LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1) + LightGbmClassifier(minimum_example_count_per_leaf=1, minimum_example_count_per_group=1) ]) assert ppl is not None 
ppl.fit(trainData[["SentimentText"]], np.array(trainData["Sentiment"])) diff --git a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py index 34372b30..21aa24a0 100644 --- a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py +++ b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py @@ -76,7 +76,7 @@ def test_pass_predict_proba_binary(self): assert_almost_equal( proba_sum( LogisticRegressionBinaryClassifier( - train_threads=1)), + number_of_threads=1)), 38.0, decimal=3, err_msg=invalid_predict_proba_output) @@ -84,7 +84,7 @@ def test_pass_predict_proba_binary(self): def test_pass_predict_proba_binary_with_pipeline(self): assert_almost_equal( proba_sum(Pipeline([LogisticRegressionBinaryClassifier( - train_threads=1)])), 38.0, decimal=3, + number_of_threads=1)])), 38.0, decimal=3, err_msg=invalid_predict_proba_output) def test_pass_predict_proba_multiclass(self): @@ -105,7 +105,7 @@ def test_pass_predict_proba_multiclass_with_pipeline(self): err_msg=invalid_predict_proba_output) def test_pass_predict_proba_multiclass_3class(self): - clf = FastLinearClassifier(train_threads=1) + clf = FastLinearClassifier(number_of_threads=1) clf.fit(X_train_3class, y_train_3class) s = clf.predict_proba(X_test_3class).sum() assert_almost_equal( @@ -146,12 +146,12 @@ def test_pass_predict_proba_from_load_model(selfs): class TestDecisionFunction(unittest.TestCase): def test_pass_decision_function_binary(self): assert_almost_equal(decfun_sum(FactorizationMachineBinaryClassifier( - )), -38.384098, decimal=5, err_msg=invalid_decision_function_output) + )), -32.618393, decimal=5, err_msg=invalid_decision_function_output) def test_pass_decision_function_binary_with_pipeline(self): assert_almost_equal( decfun_sum(Pipeline([FactorizationMachineBinaryClassifier( - )])), -38.384098, decimal=5, + )])), -32.618393, decimal=5, err_msg=invalid_decision_function_output) def test_pass_decision_function_multiclass(self): @@ -164,7 +164,7 @@ def test_pass_decision_function_multiclass_with_pipeline(self): )])), -96.87325, decimal=4, err_msg=invalid_decision_function_output) def test_pass_decision_function_multiclass_3class(self): - clf = FastLinearClassifier(train_threads=1) + clf = FastLinearClassifier(number_of_threads=1) clf.fit(X_train_3class, y_train_3class) s = clf.decision_function(X_test_3class).sum() assert_almost_equal( diff --git a/src/python/nimbusml/tests/pipeline/test_score_method.py b/src/python/nimbusml/tests/pipeline/test_score_method.py index 37ce45b4..0d1eff21 100644 --- a/src/python/nimbusml/tests/pipeline/test_score_method.py +++ b/src/python/nimbusml/tests/pipeline/test_score_method.py @@ -27,7 +27,7 @@ def test_score_binary(self): X_train, X_test, y_train, y_test = \ train_test_split(df.loc[:, df.columns != 'Label'], df['Label']) - lr = LogisticRegressionBinaryClassifier(train_threads=1) + lr = LogisticRegressionBinaryClassifier(number_of_threads=1) e = Pipeline([lr]) e.fit(X_train, y_train) metrics = e.score(X_test, y_test) @@ -47,7 +47,7 @@ def test_score_multiclass(self): X_train, X_test, y_train, y_test = \ train_test_split(df.loc[:, df.columns != 'Label'], df['Label']) - lr = LogisticRegressionClassifier(train_threads=1) + lr = LogisticRegressionClassifier(number_of_threads=1) e = Pipeline([lr]) e.fit(X_train, y_train.to_frame()) metrics = e.score(X_test, y_test) @@ -67,7 +67,7 @@ def test_score_regressor(self): X_train, X_test, y_train, y_test = \ 
train_test_split(df.loc[:, df.columns != 'Label'], df['Label']) - lr = FastTreesRegressor(train_threads=1) + lr = FastTreesRegressor(number_of_threads=1) e = Pipeline([lr]) e.fit(X_train, y_train.to_frame()) metrics = e.score(X_test, y_test) @@ -89,8 +89,8 @@ def test_score_clusterer(self): lr = KMeansPlusPlus( n_clusters=2, - init_algorithm="Random", - train_threads=1) + initialization_algorithm="Random", + number_of_threads=1) e = Pipeline([lr]) e.fit(X_train, y_train.to_frame()) metrics = e.score(X_test, y_test) @@ -115,9 +115,9 @@ def test_score_anomalydetection(self): svm = OneClassSvmAnomalyDetector() # noqa e = Pipeline([svm]) e.fit(X_train) - if e.nodes[-1].label_column_ is not None: + if e.nodes[-1].label_column_name_ is not None: raise ValueError("'{0}' should be None".format( - e.nodes[-1].label_column_)) + e.nodes[-1].label_column_name_)) assert y_test.name == 'Setosa' metrics = e.score(X_test, y_test) print(metrics) @@ -156,22 +156,22 @@ def test_score_ranking(self): metrics, _ = pipeline.test(eval_stream) assert_almost_equal( metrics['NDCG@1'][0], - 43.571429, - decimal=5, + 0.43571429, + decimal=7, err_msg="NDCG@1 should be %s" % - 43.571429) + 0.43571429) assert_almost_equal( metrics['NDCG@2'][0], - 51.28226, - decimal=5, + 0.5128226, + decimal=7, err_msg="NDCG@2 should be %s" % - 51.28226) + 0.5128226) assert_almost_equal( metrics['NDCG@3'][0], - 55.168069, - decimal=5, + 0.55168069, + decimal=7, err_msg="NDCG@3 should be %s" % - 55.168069) + 0.55168069) assert_almost_equal( metrics['DCG@1'][0], 4.688759, diff --git a/src/python/nimbusml/tests/pipeline/test_uci_adult.py b/src/python/nimbusml/tests/pipeline/test_uci_adult.py index 42ba4f47..990f0b72 100644 --- a/src/python/nimbusml/tests/pipeline/test_uci_adult.py +++ b/src/python/nimbusml/tests/pipeline/test_uci_adult.py @@ -37,7 +37,7 @@ class TestUciAdult(unittest.TestCase): def test_file_no_schema(self): pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) assert_raises_regex( TypeError, @@ -54,7 +54,7 @@ def test_file_no_schema(self): def test_linear_file(self): pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) train_stream = FileDataStream(train_file, schema=file_schema) @@ -67,7 +67,7 @@ def test_linear_file(self): def test_linear_file_role(self): pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) train_stream = FileDataStream(train_file, schema=file_schema) train_stream._set_role('Label', label_column) @@ -79,7 +79,7 @@ def test_linear_file_role(self): def test_linear_file_role2(self): pipeline = Pipeline([OneHotVectorizer() << categorical_columns, FastLinearBinaryClassifier( - train_threads=1, shuffle=False) << { + number_of_threads=1, shuffle=False) << { 'Label': label_column}]) train_stream = FileDataStream(train_file, schema=file_schema) train_stream._set_role('Label', label_column) @@ -102,7 +102,7 @@ def test_linear(self): (train, label) = get_X_y(train_file, label_column, sep=',') (test, label1) = get_X_y(test_file, label_column, sep=',') pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) pipeline.fit(train, label) 
out_data = pipeline.predict(test) @@ -112,7 +112,7 @@ def test_linear_with_train_schema(self): (train, label) = get_X_y(train_file, label_column, sep=',') (test, label1) = get_X_y(test_file, label_column, sep=',') pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) pipeline.fit(train, label) out_data = pipeline.predict(test) @@ -122,7 +122,7 @@ def test_linear_with_test_schema(self): (train, label) = get_X_y(train_file, label_column, sep=',') (test, label1) = get_X_y(test_file, label_column, sep=',') pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) pipeline.fit(train, label) out_data = pipeline.predict(test) @@ -132,7 +132,7 @@ def test_linear_with_train_test_schema(self): (train, label) = get_X_y(train_file, label_column, sep=',') (test, label1) = get_X_y(test_file, label_column, sep=',') pipeline = Pipeline([OneHotVectorizer() << categorical_columns, - FastLinearBinaryClassifier(train_threads=1, + FastLinearBinaryClassifier(number_of_threads=1, shuffle=False)]) pipeline.fit(train, label) out_data = pipeline.predict(test) diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index 7f6d2e0f..9b072af4 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -65,7 +65,7 @@ def test_input_types(self): 1.1, 2.2, 3.3, np.nan, 5.5], f1=[ 2.2, np.nan, 4.4, 5.5, 6.6])) h = Handler(replace_with='Mean') - ft = FastLinearRegressor(shuffle=False, train_threads=1) + ft = FastLinearRegressor(shuffle=False, number_of_threads=1) p = Pipeline([h, ft]) p.fit(df[['f', 'f1']].values, df['Label']) res = p.predict(df[['f', 'f1']].values) diff --git a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py index 42543f88..592d1665 100644 --- a/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py +++ b/src/python/nimbusml/tests/preprocessing/text/test_ngramfeaturizer.py @@ -204,7 +204,7 @@ def test_ngramfeaturizer_single(self): columns={'features': ['id', 'education']}) features = xf.fit_transform(data) - assert features.shape == (248, 652) + assert features.shape == (248, 637) def test_ngramfeaturizer_multi(self): diff --git a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py index 380c1623..503c21a6 100644 --- a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py +++ b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py @@ -58,7 +58,7 @@ def test_linear(self): ('linear', FastLinearBinaryClassifier( shuffle=False, - train_threads=1))]) + number_of_threads=1))]) pipe.fit(train, label) out_data = pipe.predict(test) check_accuracy_scikit(test_file, label_column, out_data, 0.779) @@ -90,7 +90,7 @@ def test_feature_union(self): pipe = Pipeline( steps=[ ('fu', fu), ('linear', FastLinearBinaryClassifier( - shuffle=False, train_threads=1))]) + shuffle=False, number_of_threads=1))]) pipe.fit(train, label) out_data = pipe.predict(test) check_accuracy_scikit(test_file, label_column, out_data, 0.709) @@ -284,21 +284,21 @@ def 
test_pipeline_grid_search(self): if 'F1' in X_train.columns: raise Exception("F1 is in the dataset") cat = OneHotVectorizer() << 'age' - ftree = FastTreesBinaryClassifier(num_trees=5) + ftree = FastTreesBinaryClassifier(number_of_trees=5) pipe = Pipeline( steps=[ ("cat", cat), ('pca', PCA(5)), ("ftree", ftree)]) grid = GridSearchCV(pipe, dict(pca__n_components=[2], - ftree__num_trees=[11])) + ftree__number_of_trees=[11])) grid.fit(X_train, y_train) assert grid.best_params_ == { - 'ftree__num_trees': 11, + 'ftree__number_of_trees': 11, 'pca__n_components': 2} steps = grid.best_estimator_.steps ft = steps[-1][1] - num_trees = ft.num_trees - assert num_trees == 11 + number_of_trees = ft.number_of_trees + assert number_of_trees == 11 def test_lr_named_steps_iris(self): iris = load_iris() diff --git a/src/python/nimbusml/tests/test_data_schema.py b/src/python/nimbusml/tests/test_data_schema.py index f63b38ca..3b48266e 100644 --- a/src/python/nimbusml/tests/test_data_schema.py +++ b/src/python/nimbusml/tests/test_data_schema.py @@ -497,7 +497,7 @@ def test_schema_sep_default(self): add_sep=True) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 " \ "header=+ sep=," exp = Pipeline([OneHotVectorizer(columns=['text']), - LightGbmRegressor(min_data_per_leaf=1)]) + LightGbmRegressor(minimum_example_count_per_leaf=1)]) exp.fit(ds, 'y') pred = exp.predict(ds) assert pred is not None diff --git a/src/python/nimbusml/tests/test_data_types.py b/src/python/nimbusml/tests/test_data_types.py index 617fda64..ed8643d2 100644 --- a/src/python/nimbusml/tests/test_data_types.py +++ b/src/python/nimbusml/tests/test_data_types.py @@ -113,7 +113,7 @@ def test_dtype(xtype=None, ytype=None, dense=False): ydata = ydata.astype(ytype) assert ydata.dtype == ytype - algo = FastLinearBinaryClassifier(max_iterations=2) + algo = FastLinearBinaryClassifier(maximum_number_of_iterations=2) algo.fit(xdata, ydata) assert algo.model_ is not None @@ -155,7 +155,7 @@ def test_data_types(self): "================ Testing sparse xtype %s, ytype %s " "================" % (str(xtype), str(ytype))) - if (xtype == np.float16 or ytype == np.float16): + if (xtype == np.uint64 or xtype == np.float16 or ytype == np.float16): assert_raises( (TypeError, ValueError, RuntimeError), test_dtype, xtype, ytype) diff --git a/src/python/nimbusml/tests/test_entrypoints.py b/src/python/nimbusml/tests/test_entrypoints.py index 6b0beb09..257d5bef 100644 --- a/src/python/nimbusml/tests/test_entrypoints.py +++ b/src/python/nimbusml/tests/test_entrypoints.py @@ -51,13 +51,13 @@ def test_trainers_logisticregressionbinaryclassifier(self): node = trainers_logisticregressionbinaryclassifier( training_data=training_data, quiet=quiet, - label_column=label_column, + label_column_name=label_column, predictor_model=predictor_model) # check assert isinstance(node, EntryPoint) assert node.inputs["TrainingData"] == training_data assert node.inputs["Quiet"] == quiet - assert node.inputs["LabelColumn"] == label_column + assert node.inputs["LabelColumnName"] == label_column assert node.input_variables == {training_data} assert node.output_variables == {predictor_model} diff --git a/src/python/nimbusml/tests/test_syntax.py b/src/python/nimbusml/tests/test_syntax.py index 181cfaa4..27c1c3b3 100644 --- a/src/python/nimbusml/tests/test_syntax.py +++ b/src/python/nimbusml/tests/test_syntax.py @@ -37,7 +37,7 @@ def test_syntax1(self): exp = Pipeline([ OneHotVectorizer(), - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) 
exp.fit(X, y) prediction = exp.predict(X) @@ -57,7 +57,7 @@ def test_syntax2(self): exp = Pipeline([ OneHotVectorizer() << 'education', OneHotVectorizer(max_num_terms=2) << 'workclass', - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) exp.fit(X, y) prediction = exp.predict(X) @@ -77,7 +77,7 @@ def test_syntax2_lt(self): exp = Pipeline([ OneHotVectorizer() << 'education', OneHotVectorizer(max_num_terms=2) << 'workclass', - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) exp.fit(X, y) prediction = exp.predict(X) @@ -103,7 +103,7 @@ def test_syntax3(self): # does not do what the syntax implicetely tells. # We need to modify either the bridge to look into # every available column at one step. - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) exp.fit(X, y) prediction = exp.predict(X) @@ -125,7 +125,7 @@ def test_syntax4(self): OneHotHashVectorizer() << {'edu2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'}, Concat() << {'Inputs': ['edu1', 'edu2', 'wki']}, - FastLinearBinaryClassifier(max_iterations=1) << 'Inputs' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Inputs' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -147,7 +147,7 @@ def test_syntax4_2(self): OneHotHashVectorizer() << {'edu2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'}, Concat() << {'Inputs': ['edu1', 'edu2', 'wki']}, - FastLinearBinaryClassifier(max_iterations=1) << 'Inputs' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Inputs' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -169,7 +169,7 @@ def test_syntax4_dict(self): OneHotHashVectorizer() << {'edu2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'}, Concat() << {'Inputs': ['edu1', 'edu2', 'wki']}, - FastLinearBinaryClassifier(max_iterations=1) << 'Inputs' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Inputs' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -191,7 +191,7 @@ def test_syntax4_columns(self): OneHotHashVectorizer(columns={'edu2': 'education'}), OneHotVectorizer(max_num_terms=2, columns={'wki': 'workclass'}), Concat(columns={'Inputs': ['edu1', 'edu2', 'wki']}), - FastLinearBinaryClassifier(max_iterations=1) << 'Inputs' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Inputs' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -214,7 +214,7 @@ def test_syntax4_fail(self): OneHotVectorizer() << {'edu1': 'education'}, OneHotHashVectorizer() << {'edu2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'}, - FastLinearBinaryClassifier(max_iterations=1) << ['edu1', 'edu2', + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << ['edu1', 'edu2', 'wki'] ]) try: @@ -238,7 +238,7 @@ def test_syntax4_fail2(self): OneHotVectorizer() << {'edu1': 'education'}, OneHotHashVectorizer() << {'edu2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'}, - FastLinearBinaryClassifier(max_iterations=1) << ['edu1', 'edu4', + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << ['edu1', 'edu4', 'wki'] ]) try: @@ -259,7 +259,7 @@ def test_syntax5(self): OneHotHashVectorizer() << {'f2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'Features': ['f%d' % i for i in range(1, 4)]}, - FastLinearBinaryClassifier(max_iterations=1) << 'Features' + 
FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Features' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -287,7 +287,7 @@ def test_syntax5_regular_expression(self): OneHotHashVectorizer() << {'f2': 'education'}, OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'Features': 'f[0-9]+'}, - FastLinearBinaryClassifier(max_iterations=1) << 'Features' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'Features' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -310,7 +310,7 @@ def test_syntax6(self): OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'Features': ['f%d' % i for i in range(1, 4)]}, Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'], - FastLinearBinaryClassifier(max_iterations=1) << ['Features'] + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << ['Features'] ]) exp.fit(X, y) prediction = exp.predict(X) @@ -333,7 +333,7 @@ def test_syntax6_not_features(self): OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'FeaturesCustom': ['f%d' % i for i in range(1, 4)]}, Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'], - FastLinearBinaryClassifier(max_iterations=1) << 'FeaturesCustom' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'FeaturesCustom' ]) exp.fit(X, y) prediction = exp.predict(X) @@ -362,7 +362,7 @@ def test_syntax6_change_role(self): OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'Features': ['f%d' % i for i in range(1, 4)]}, Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'], - FastLinearBinaryClassifier(max_iterations=1) << ['Features'] + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << ['Features'] ]) exp.fit(X, y) prediction = exp.predict(X) @@ -386,7 +386,7 @@ def test_syntax6_regular_expression(self): OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, Concat() << {'Features': ['f%d' % i for i in range(1, 4)]}, Drop() << '~Features', - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) exp.fit(X, y) prediction = exp.predict(X) @@ -518,7 +518,7 @@ def test_syntax11_learner(self): OneHotVectorizer() << { 'edu1': 'education'}, OneHotHashVectorizer() << { 'edu2': 'education'}, FastLinearBinaryClassifier( - max_iterations=1) << { + maximum_number_of_iterations=1) << { 'Features': ['edu1', 'edu2'], Role.Label: 'y'}]) exp.fit(df) prediction = exp.predict(X) @@ -542,7 +542,7 @@ def test_syntax11_append_insert(self): exp.insert(0, OneHotVectorizer() << {'edu1': 'education'}) exp.append( FastLinearBinaryClassifier( - max_iterations=1) << { + maximum_number_of_iterations=1) << { 'Features': [ 'edu1', 'edu2'], diff --git a/src/python/nimbusml/tests/test_syntax_learner.py b/src/python/nimbusml/tests/test_syntax_learner.py index 98cb7504..2c649304 100644 --- a/src/python/nimbusml/tests/test_syntax_learner.py +++ b/src/python/nimbusml/tests/test_syntax_learner.py @@ -15,7 +15,7 @@ from nimbusml.internal.utils.data_roles import Role from nimbusml.linear_model import AveragedPerceptronBinaryClassifier from nimbusml.linear_model import FastLinearBinaryClassifier, \ - FastLinearRegressor + FastLinearRegressor, OnlineGradientDescentRegressor from nimbusml.preprocessing import ToKey from nimbusml.preprocessing.normalization import MeanVarianceScaler from nimbusml.preprocessing.schema import ColumnConcatenator as Concat, \ @@ -46,7 +46,7 @@ def test_syntax7(self): OneHotVectorizer() << 'y', OneHotVectorizer() << ['workclass', 'education'], 
TypeConverter(result_type='R4') << 'y', - FastLinearBinaryClassifier(max_iterations=1) + FastLinearBinaryClassifier(maximum_number_of_iterations=1) ]) exp.fit(X, y, verbose=0) prediction = exp.predict(X) @@ -83,7 +83,7 @@ def test_syntax7_rename(self): OneHotVectorizer() << ['workclass', 'education'], TypeConverter(result_type='R4') << {'yi': 'y'}, Drop() << 'y', - FastLinearBinaryClassifier(max_iterations=1) << 'yi' + FastLinearBinaryClassifier(maximum_number_of_iterations=1) << 'yi' ]) exp.fit(X, y, verbose=0) prediction = exp.predict(X) @@ -107,8 +107,8 @@ def test_syntax8_label(self): Role.Label: 'new_y'} ]) exp.fit(df, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Features' - assert exp.nodes[-1].label_column_ == 'new_y' + assert exp.nodes[-1].feature_column_name_ == 'Features' + assert exp.nodes[-1].label_column_name_ == 'new_y' # The pipeline requires it now as it is transformed all along. X['yy'] = 0.0 prediction = exp.predict(X, verbose=0) @@ -133,8 +133,8 @@ def test_syntax9_label_name(self): Role.Label: 'new_y'} ]) exp.fit(X, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Features' - assert exp.nodes[-1].label_column_ == 'new_y' + assert exp.nodes[-1].feature_column_name_ == 'Features' + assert exp.nodes[-1].label_column_name_ == 'new_y' # The pipeline requires it now as it is transformed all along. X['yy'] = 0.0 prediction = exp.predict(X) @@ -157,7 +157,7 @@ def test_syntax10_weights_fail(self): exp = Pipeline([ OneHotVectorizer() << ['workclass', 'education'], - FastLinearRegressor() + OnlineGradientDescentRegressor() ]) try: exp.fit(X, y, weight=weights, verbose=0) @@ -180,9 +180,9 @@ def test_syntax10_weights(self): FastLinearRegressor() ]) exp.fit(X, y, weight=w, verbose=0) - assert exp.nodes[-1].feature_column == 'Features' - assert exp.nodes[-1].label_column == 'y' - assert exp.nodes[-1].weight_column == 'weight' + assert exp.nodes[-1].feature_column_name == 'Features' + assert exp.nodes[-1].label_column_name == 'y' + assert exp.nodes[-1].example_weight_column_name == 'weight' X['weight'] = -5 prediction = exp.predict(X) assert isinstance(prediction, pandas.DataFrame) @@ -211,14 +211,14 @@ def test_syntax10_weights_operator(self): 'workclass', 'education']}, FastTreesRegressor( - num_trees=5) << { + number_of_trees=5) << { 'Feature': 'Feature', Role.Label: 'y', Role.Weight: 'weight'}]) exp.fit(X, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Feature' - assert exp.nodes[-1].label_column_ == 'y' - assert exp.nodes[-1].weight_column_ == 'weight' + assert exp.nodes[-1].feature_column_name_ == 'Feature' + assert exp.nodes[-1].label_column_name_ == 'y' + assert exp.nodes[-1].example_weight_column_name_ == 'weight' # y is required here as well as weight. # It is replaced by fakes values. # The test does not fail but the weight is not taken into account. 
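# Editor's note (illustrative sketch, not part of the diff): the hunks above are
# mechanical renames for the NimbusML 1.0 / ML.NET 1.0 API. The old-name -> new-name
# mapping implied by these test updates, plus a hypothetical helper for reference only.
# Note that output_tokens=True additionally changed shape: it became
# output_tokens_column_name='<column name>' rather than a boolean flag.
PARAM_RENAMES = {
    'train_threads': 'number_of_threads',
    'num_trees': 'number_of_trees',
    'num_leaves': 'number_of_leaves',
    'num_boost_round': 'number_of_iterations',
    'min_split': 'minimum_example_count_per_leaf',
    'min_data_per_leaf': 'minimum_example_count_per_leaf',
    'min_data_per_group': 'minimum_example_count_per_group',
    'max_iterations': 'maximum_number_of_iterations',
    'check_frequency': 'convergence_check_frequency',
    'l2_weight': 'l2_regularization',
    'init_algorithm': 'initialization_algorithm',
    'label_column': 'label_column_name',
    'weight_column': 'example_weight_column_name',
    'group_id_column': 'row_group_column_name',
}

def rename_params(kwargs):
    """Map legacy keyword names to their 1.0 equivalents (hypothetical helper, sketch only)."""
    return {PARAM_RENAMES.get(k, k): v for k, v in kwargs.items()}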
@@ -238,13 +238,13 @@ def test_syntax11_constructor(self): exp = Pipeline([ OneHotVectorizer(columns=['workclass', 'education']), Concat(columns={'Feature': ['workclass', 'education']}), - FastTreesRegressor(num_trees=5, feature='Feature', label='y', + FastTreesRegressor(number_of_trees=5, feature='Feature', label='y', weight='weight') ]) exp.fit(X, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Feature' - assert exp.nodes[-1].label_column_ == 'y' - assert exp.nodes[-1].weight_column_ == 'weight' + assert exp.nodes[-1].feature_column_name_ == 'Feature' + assert exp.nodes[-1].label_column_name_ == 'y' + assert exp.nodes[-1].example_weight_column_name_ == 'weight' # y is required here as well as weight. # It is replaced by fakes values. # The test does not fail but the weight is not taken into account. @@ -264,13 +264,13 @@ def test_syntax12_mixed1(self): exp = Pipeline([ OneHotVectorizer(columns=['workclass', 'education']), Concat(columns={'Feature': ['workclass', 'education']}), - FastTreesRegressor(num_trees=5, label='y', + FastTreesRegressor(number_of_trees=5, label='y', weight='weight') << 'Feature' ]) exp.fit(X, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Feature' - assert exp.nodes[-1].label_column_ == 'y' - assert exp.nodes[-1].weight_column_ == 'weight' + assert exp.nodes[-1].feature_column_name_ == 'Feature' + assert exp.nodes[-1].label_column_name_ == 'y' + assert exp.nodes[-1].example_weight_column_name_ == 'weight' # y is required here as well as weight. # It is replaced by fakes values. # The test does not fail but the weight is not taken into account. @@ -296,12 +296,12 @@ def test_syntax12_mixed2(self): columns={ 'Feature': ['workclass', 'education']}), FastTreesRegressor( - num_trees=5, feature='Feature', weight='weight') << { + number_of_trees=5, feature='Feature', weight='weight') << { Role.Label: 'y'}]) exp.fit(X, verbose=0) - assert exp.nodes[-1].feature_column_ == 'Feature' - assert exp.nodes[-1].label_column_ == 'y' - assert exp.nodes[-1].weight_column_ == 'weight' + assert exp.nodes[-1].feature_column_name_ == 'Feature' + assert exp.nodes[-1].label_column_name_ == 'y' + assert exp.nodes[-1].example_weight_column_name_ == 'weight' # y is required here as well as weight. # It is replaced by fakes values. # The test does not fail but the weight is not taken into account. 
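# Editor's note (illustrative sketch, not part of the diff): a minimal end-to-end
# example mirroring test_syntax11_constructor above, showing the renamed trained
# attributes (feature_column_name_, label_column_name_, example_weight_column_name_).
# Assumes a nimbusml >= 1.0 install; import paths follow the usual nimbusml layout.
import pandas
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesRegressor
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.preprocessing.schema import ColumnConcatenator as Concat

X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B'],
                          workclass=['X', 'X', 'Y', 'Y'],
                          y=[1.1, 2.2, 1.3, 3.4],
                          weight=[1.0, 1.0, 1.0, 3.0]))
exp = Pipeline([
    OneHotVectorizer(columns=['workclass', 'education']),
    Concat(columns={'Feature': ['workclass', 'education']}),
    # Roles passed through the constructor, with the 1.0 parameter names.
    FastTreesRegressor(number_of_trees=5, feature='Feature', label='y',
                       weight='weight')
])
exp.fit(X, verbose=0)
# Trained attributes now carry the ML.NET 1.0 column-name suffix:
assert exp.nodes[-1].feature_column_name_ == 'Feature'
assert exp.nodes[-1].label_column_name_ == 'y'
assert exp.nodes[-1].example_weight_column_name_ == 'weight'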
@@ -323,22 +323,22 @@ def test_syntax12_group(self): OneHotVectorizer(columns=['workclass', 'education']), Concat(columns={'Feature': ['workclass', 'education']}), ToKey() << 'gr', - FastTreesRegressor(num_trees=5, feature='Feature', + FastTreesRegressor(number_of_trees=5, feature='Feature', group_id='gr') << {Role.Label: 'y'} ]) exp.fit(X, verbose=0) assert not hasattr(exp.nodes[-1], 'feature_') assert not hasattr(exp.nodes[-1], 'group_id_') - assert exp.nodes[-1].feature_column_ == 'Feature' - assert exp.nodes[-1].label_column_ == 'y' - # assert not hasattr(exp.nodes[-1], 'group_id_column_') + assert exp.nodes[-1].feature_column_name_ == 'Feature' + assert exp.nodes[-1].label_column_name_ == 'y' + # assert not hasattr(exp.nodes[-1], 'row_group_column_name_') assert not hasattr(exp.nodes[-1], 'group_id_column') assert not hasattr(exp.nodes[-1], 'groupid_column_') assert not hasattr(exp.nodes[-1], 'groupid_column') - if not hasattr(exp.nodes[-1], 'group_id_column_'): + if not hasattr(exp.nodes[-1], 'row_group_column_name_'): raise AssertionError("Attribute not found: {0}".format( ", ".join(sorted(dir(exp.nodes[-1]))))) - assert exp.nodes[-1].group_id_column_ == 'gr' + assert exp.nodes[-1].row_group_column_name_ == 'gr' # y is required here as well as weight. # It is replaced by fakes values. # The test does not fail but the weight is not taken into account. diff --git a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py index b48cf7a4..556271af 100644 --- a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py +++ b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py @@ -41,7 +41,7 @@ def test_syntax1_passing(self): exp = Pipeline([ OneHotVectorizer() << {'f1': 'education2'}, OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'}, - LightGbmClassifier(min_data_per_leaf=1) << ['f1', 'f3'] + LightGbmClassifier(minimum_example_count_per_leaf=1) << ['f1', 'f3'] ]) exp.fit(X, y) res = exp.transform(X) diff --git a/src/python/nimbusml/tests/test_utils.py b/src/python/nimbusml/tests/test_utils.py index d02b5600..48f2241a 100644 --- a/src/python/nimbusml/tests/test_utils.py +++ b/src/python/nimbusml/tests/test_utils.py @@ -18,8 +18,8 @@ def check_supported_losses(testcase, learner, losses, acc_threshold): # 247514 for that work. 
learner_args = getargspec(learner.__init__).args kwargs = {} - if 'train_threads' in learner_args and 'shuffle' in learner_args: - kwargs.update({'train_threads': 1, 'shuffle': False}) + if 'number_of_threads' in learner_args and 'shuffle' in learner_args: + kwargs.update({'number_of_threads': 1, 'shuffle': False}) for l in losses: kwargs['loss'] = l accuracy = get_accuracy(testcase, learner(**kwargs)) diff --git a/src/python/nimbusml/tests/utils/test_exports.py b/src/python/nimbusml/tests/utils/test_exports.py index 3dcaf7e3..96d1ddfa 100644 --- a/src/python/nimbusml/tests/utils/test_exports.py +++ b/src/python/nimbusml/tests/utils/test_exports.py @@ -69,17 +69,18 @@ def test_object_parameters(self): Role.Label: 'new_y'} exp = {'bias_learning_rate': 1.0, 'caching': 'Auto', - 'check_frequency': None, + 'convergence_check_frequency': None, 'convergence_tolerance': 0.01, 'feature': ['workclass', 'education'], 'l1_threshold': None, - 'l2_weight': None, + 'l2_regularization': None, 'label': 'new_y', 'loss': 'squared', - 'max_iterations': None, + 'maximum_number_of_iterations': None, 'normalize': 'Auto', 'shuffle': True, - 'train_threads': None} + 'weight': None, + 'number_of_threads': None} assert obj3.get_params() == exp def test_object_clone(self): @@ -308,9 +309,9 @@ def test_pipeline_exports(self): ]) for node in exp.nodes: - if hasattr(node, 'label_column'): - assert node.label_column == 'new_y' - assert exp.nodes[-1].label_column == 'new_y' + if hasattr(node, 'label_column_name'): + assert node.label_column_name == 'new_y' + assert exp.nodes[-1].label_column_name == 'new_y' res = dot_export_pipeline(exp, df).strip("\n\r ") exp = """ @@ -564,10 +565,10 @@ def test_word_embedding(self): False, True])) - ng = NGramFeaturizer(columns=['description'], output_tokens=True) + ng = NGramFeaturizer(columns=['description'], output_tokens_column_name='description_TransformedText') we = WordEmbedding( columns='description_TransformedText', - model_kind='Sswe') + model_kind='SentimentSpecificWordEmbedding') model = Pipeline([ng, we]) dot_vis = dot_export_pipeline(model, ds_train) diff --git a/src/python/setup.py b/src/python/setup.py index 213acaa2..e1059ce6 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -44,7 +44,7 @@ # Versions should comply with PEP440. 
For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='0.7.0', + version='1.0.0', description='NimbusML', long_description=long_description, diff --git a/src/python/tests/test_docs_example.py b/src/python/tests/test_docs_example.py index e017b927..310f83ce 100644 --- a/src/python/tests/test_docs_example.py +++ b/src/python/tests/test_docs_example.py @@ -42,9 +42,10 @@ def test_examples(self): fold_files.sort() modpath = os.path.abspath(os.path.dirname(myfile)) - modpath = os.path.normpath( - os.path.join(os.path.join(modpath), '..')) + modpath = os.path.normpath(os.path.join(os.path.join(modpath), '..')) os.environ['PYTHONPATH'] = modpath + os.environ['PYTHONIOENCODING'] = 'UTF-8' + start = 0 ran = 0 excs = [] @@ -56,24 +57,24 @@ def test_examples(self): # Bug 294481: CharTokenizer_df fails # with error about variable length vector 'CharTokenizer_df.py', + # Bug todo: CustomStopWordsRemover fails on ML.NET side + 'NGramFeaturizer2.py', + # System.Drawings.Common.dll 4.0.0 is needed + 'Image.py', 'Image_df.py', ]: continue - if (os.name != "nt" and (platform.linux_distribution()[ - 0] != "Ubuntu" or - platform.linux_distribution()[ - 1] != "16.04")): - if name in { - 'Image.py', - 'Image_df.py', - 'DssmFeaturizer.py', - 'Sentiment.py'}: - # REVIEW: fix ssl issue on test centos7 & ubuntu14 - # boxes. - # Tests work on ubuntu16. - continue - if os.name != "nt" and six.PY2: - if name in {'NaiveBayesClassifier_df.py'}: + if os.name != "nt": + if name in [ + # SymSgdNative fails to load on linux + 'SymSgdBinaryClassifier.py', + 'SymSgdBinaryClassifier_infert_df.py', + # MICROSOFTML_RESOURCE_PATH needs to be setup on linux + 'WordEmbedding.py', + 'WordEmbedding_df.py', + 'NaiveBayesClassifier_df.py' + ]: continue + full = os.path.join(fold, name) cmd = '"{0}" -u "{1}"'.format( sys.executable.replace( @@ -113,6 +114,9 @@ def test_examples(self): "Your CPU supports instructions that this TensorFlow", "CacheClassesFromAssembly: can't map name " "OLSLinearRegression to Void, already mapped to Void", + # Binner.py + "from collections import Mapping, defaultdict", + "DeprecationWarning: Using or importing the ABCs", # BootStrapSample.py "DeprecationWarning: the imp module is deprecated", # PipelineWithGridSearchCV2.py @@ -133,11 +137,13 @@ def test_examples(self): # TODO: Investigate. 
exps.append("RuntimeWarning: numpy.dtype size changed") - errors = stderr.split('\n') - for exp in exps: - errors = [_ for _ in errors if exp in _] + errors = None + if stderr != '': + errors = stderr.split('\n') + for exp in exps: + errors = [_ for _ in errors if exp not in _] - if errors: + if errors and (len(errors) > 1 or (len(errors) == 1 and errors[0] != '')): excs.append(RuntimeError( "Issue with\n File '{0}'\n--CMD\n{1}\n--ERR\n{2}\n--OUT\n" "{3}\n--".format(full, cmd, '\n'.join(errors), stdout))) diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index e4e9ec19..07b1453c 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -15,7 +15,7 @@ from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram from nimbusml.preprocessing import TensorFlowScorer -from nimbusml.preprocessing.filter import SkipFilter +from nimbusml.preprocessing.filter import SkipFilter, TakeFilter from sklearn.utils.estimator_checks import _yield_all_checks, MULTI_OUTPUT this = os.path.abspath(os.path.dirname(__file__)) @@ -170,16 +170,16 @@ INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier( - min_data_per_group=1, min_data_per_leaf=1), + minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier( - min_data_per_group=1, min_data_per_leaf=1), + minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor( - min_data_per_group=1, min_data_per_leaf=1), + minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker( - min_data_per_group=1, min_data_per_leaf=1), - 'NGramFeaturizer': NGramFeaturizer( - word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter( - count=5), + minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), + 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), + 'SkipFilter': SkipFilter(count=5), + 'TakeFilter': TakeFilter(count=100000), 'TensorFlowScorer': TensorFlowScorer( model_location=os.path.join( this, @@ -254,6 +254,9 @@ def load_json(file_path): # skip LighGbm for now, because of random crashes. if 'LightGbm' in class_name: continue + # skip SymSgdBinaryClassifier for now, because of crashes. + if 'SymSgdBinaryClassifier' in class_name: + continue mod = __import__('nimbusml.' 
+ e[0], fromlist=[str(class_name)]) the_class = getattr(mod, class_name) diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py index bc240b39..6d927138 100644 --- a/src/python/tools/code_fixer.py +++ b/src/python/tools/code_fixer.py @@ -81,27 +81,13 @@ from ....internal.utils.utils import trace""" signature_fixes = { - 'DnnFeaturizer': [('source,', 'input = None,'), - ('name = None,', 'output = None,'), - ('source=source,', 'input=input,'), - ('name=name,', 'output=output,')], + 'SkipFilter': ('count = 0,', 'count,'), + 'TakeFilter': ('count = 9223372036854775807,', 'count,'), 'NGramFeaturizer': [(NG_1, NG_1_correct), ('word_feature_extractor = n_gram', 'word_feature_extractor = Ngram'), ('char_feature_extractor = n_gram', 'char_feature_extractor = Ngram')], - 'CountSelector': ('count = 0,', 'count = 1.0,'), - 'OneClassSvmAnomalyDetector': ( - 'label_column=label_column,', 'label_column=None,'), - 'RangeFilter': ('min = None,', 'min = -1,'), - # 'KMeansPlusPlus' : ('feature_column: str = \'Features\',', - # 'feature_column: str = \'Features\',\n - # label_column: str = \'Label\','), - 'SsweEmbedding': [('source,', 'input,'), - ('name = None,', 'output = None,'), - ('source=source,', 'source=input,'), - ('name=name,', 'name=output,')], - 'OneVsRestClassifier': ('nodes,', 'classifier,'), 'FactorizationMachineBinaryClassifier': (FM, FM_correct), 'OneHotHashVectorizer': (OHE, OHE_correct), 'CustomStopWordsRemover': (cust_stop, cust_stop_correct), @@ -113,30 +99,6 @@ def fix_code(class_name, filename): _fix_code(class_name, filename, signature_fixes) -dnnImageFeaturize_1 = """ def _get_node(self, **all_args): - algo_args = dict( - source=self.source, - name=self._name_or_source, - dnn_model=self.dnn_model)""" - -dnnImageFeaturize_1_correct = """ def _get_node(self, **all_args): - input_column = self.input - if input_column is None and 'input' in all_args: - input_column = all_args['input'][0] - if 'input' in all_args: - all_args.pop('input') - - output_column = self.output - if output_column is None and 'output' in all_args: - output_column = all_args['output'][0] - if 'output' in all_args: - all_args.pop('output') - - algo_args = dict( - source=input_column, - name=output_column, - dnn_model=self.dnn_model)""" - columnselector_1 = """ def _get_node(self, **all_args): algo_args = dict( keep_columns=self.keep_columns, @@ -247,31 +209,6 @@ def fix_code(class_name, filename): column=column )""" -expressionTransform_1 = \ - """ if output_columns is None and 'output' in all_args: - output_columns = all_args['output']""" - -expressionTransform_1_correct = \ - """ if output_columns is None \ - and 'output' in all_args: - output_columns = all_args['output'] - if isinstance(output_columns, list): - output_columns = output_columns[0]""" - -expressionTransform_2 = """ algo_args = dict( - column=[dict(Source=i, Name=o) for i, o in zip(input_columns, \ -output_columns)] if input_columns else None, - expression=self.expression,)""" - -expressionTransform_2_correct = """ source = [] - for i in input_columns: - source.append(i) - column = [dict([('Source', source), ('Name', output_columns)])] - - algo_args = dict( - column=column, - expression=self.expression)""" - onevsrestclassifier_1 = """ all_args.update(algo_args)""" onevsrestclassifier_1_correct = """ @@ -282,26 +219,11 @@ def fix_code(class_name, filename): all_args['predictor_model']}""" signature_fixes_core = { - 'DnnFeaturizer': [ # ('source,', 'input = None,'), - # ('name = None,', 'output = None,'), - ('self.source=source', 
'self.input=input'), - ('self.name=name', 'self.output=output'), - (dnnImageFeaturize_1, dnnImageFeaturize_1_correct)], 'NGramFeaturizer': (textTransform_1, textTransform_1_correct), - 'CountSelector': ('count = 0,', 'count = 1.0,'), - 'ColumnConcatenator': [('output = None,', 'output = None,'), - (concatColumns_1, concatColumns_1_correct)], + 'ColumnConcatenator': [(concatColumns_1, concatColumns_1_correct)], 'ColumnSelector': [(columnselector_1, columnselector_1_correct)], - 'RangeFilter': ('min = None,', 'min = -1,'), - 'Expression': [(expressionTransform_1, expressionTransform_1_correct), - (expressionTransform_2, expressionTransform_2_correct)], 'OneVsRestClassifier': [ (onevsrestclassifier_1, onevsrestclassifier_1_correct)], - 'TensorFlowScorer': [ - ('model=self.model', 'model_location=self.model')], - 'Expression': ('zip(input_columns', - 'zip([[x] for x in input_columns] if not ' \ - 'isinstance(input_columns[0], list) else input_columns') } @@ -317,22 +239,7 @@ def fix_code_core(class_name, filename): outputs['PredictorModel'] = try_set(obj=model, \ none_acceptable=False, is_of_type=str)""" -tf_1_incorrect = """def transforms_tensorflowscorer( - model,""" - -tf_1_correct = """def transforms_tensorflowscorer( - model_location,""" - -tf_2_incorrect = """ if model is not None: - inputs['Model'] = try_set(obj=model""" - -tf_2_correct = """ if model_location is not None: - inputs['Model'] = try_set(obj=model_location""" - signature_fixes_entrypoint = { - 'SelectFeatures.CountSelect': ('count = 0,', 'count,'), - 'SelectRows.SkipFilter': ('count = 0,', 'count,'), - 'SelectRows.TakeFilter': ('count = 0,', 'count,'), 'Transforms.TextFeaturizer': ('column = 0,', 'column,'), 'Transforms.ManyHeterogeneousModelCombiner': [ ('predictor_model = None,', 'model = None,'), @@ -340,10 +247,6 @@ def fix_code_core(class_name, filename): 'Transforms.TwoHeterogeneousModelCombiner': [ ('predictor_model = None,', 'model = None,'), (s_1_incorrect, s_1_correct)], - 'Transforms.TensorFlowScorer': [ - (tf_1_incorrect, tf_1_correct), - (':param model: TensorFlow', ':param model_location: TensorFlow'), - (tf_2_incorrect, tf_2_correct)], 'Transforms.LightLda' : ('num_threads = 0,', 'num_threads = None,'), 'Trainers.GeneralizedAdditiveModelRegressor': ('Infinity', 'float("inf")'), 'Trainers.GeneralizedAdditiveModelBinaryClassifier': ( @@ -368,15 +271,6 @@ def _fix_code(class_name, filename, fixes_dict): code = f.read() first = True for fix in fixes: - #if fix[0] in code: - # if first: - # print(" [_fix_code]", os.path.abspath(filename)) - # first = False - # print( - # " '{0}' --> '{1}'".format( - # fix[0].replace( - # "\n", "\\n"), fix[1].replace( - # "\n", "\\n"))) code = code.replace(fix[0], fix[1]) f.seek(0) f.write(code) @@ -411,8 +305,10 @@ def run_autoflake(filename): parser.add_argument('--remove-all-unused-imports', action='store_true') cmd_args = ['--in-place', '--remove-all-unused-imports'] args = parser.parse_args(cmd_args) + args.check = None args.imports = None args.expand_star_imports = None args.remove_duplicate_keys = None args.remove_unused_variables = None + args.ignore_init_module_imports = False autoflake.fix_file(filename, args=args, standard_out=sys.stdout) diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index d437f5ae..f368f385 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -58,13 +58,43 @@ class Role: Feature = 'Feature' Label = 'Label' - Weight = 'Weight' - GroupId = 'GroupId' + Weight 
= 'ExampleWeight' + GroupId = 'RowGroup' + # unsupported roles below User = 'User' Item = 'Item' Name = 'Name' RowId = 'RowId' + @staticmethod + def get_column_name(role, suffix="ColumnName"): + """ + Converts a role into a column name + ``GroupId --> RowGroupColumnName``. + """ + if not isinstance(role, str): + raise TypeError("Unexpected role '{0}'".format(role)) + if role == "Weight": + return Role.Weight + suffix + if role == "GroupId": + return Role.GroupId + suffix + return role + suffix + + @staticmethod + def to_attribute(role, suffix="_column_name"): + """ + Converts a role into a tuple of pythonic original and extended name. + ``groupid --> (group_id, row_group_column_name)``. + """ + if not isinstance(role, str): + raise TypeError("Unexpected role '{0}'".format(role)) + if role == "weight": + return ("weight", "example_weight" + suffix) + if role == "groupid": + return ("group_id", "row_group" + suffix) + if role == "rowid": + return ("row_id", "row_id" + suffix) + return (role.lower(), role.lower() + suffix) _allowed_roles = set(k for k in Role.__dict__ if k[0].upper() == k[0]) @@ -602,7 +632,7 @@ def write_class( hidden = set(a.name for a in hidden_args) allowed_roles = sorted([k.lower() for k in _allowed_roles if - k + 'Column' in hidden]) + Role.get_column_name(k) in hidden]) sig_columns_roles = list(allowed_roles) base_file = "base_predictor" @@ -731,21 +761,17 @@ def write_class( body_sig_params = [] for h in sig_columns_roles: # add roles as allowed parameters - if h == 'groupid': - h = 'group_id' - elif h == 'colid': - h = 'col_id' - elif h == 'rowid': - h = 'row_id' if h == "columns": body_header += "\n if {0}: params['{0}'] = {0}".format( h) else: - body_header += "\n if '{0}_column' in params: raise " \ - "NameError(\"'{0}_column' must be renamed to " \ - "'{0}'\")".format(h) - body_header += "\n if {0}: params['{0}_column'] = {" \ - "0}".format(h) + body_header += "\n if '{1}' in params: raise " \ + "NameError(\"'{1}' must be renamed to " \ + "'{0}'\")".format(Role.to_attribute(h)[0], + Role.to_attribute(h)[1]) + body_header += "\n if {0}: params['{1}'] = {" \ + "0}".format(Role.to_attribute(h)[0], + Role.to_attribute(h)[1]) body_sig_params.append(h) if 'input_columns' in header and 'columns=' in header: body_header += "\n if columns: input_columns = " \ @@ -778,7 +804,7 @@ def write_class( for h in body_sig_params: body += ' self.{0}{1}={1}\n'.format( - '_' if h == 'columns' else '', h) + '_' if h == 'columns' else '', Role.to_attribute(h)[0]) if 'Predict_Proba' in entrypoint: if entrypoint['Predict_Proba'] is True: @@ -869,8 +895,9 @@ def write_core_class( module_doc = '"""\n{}\n"""\n'.format(class_name) hidden = set(a.name for a in hidden_args) - allowed_roles = [k.lower() - for k in _allowed_roles if k + 'Column' in hidden] + allowed_roles = sorted([k.lower() + for k in _allowed_roles if + Role.get_column_name(k) in hidden]) dots = '.' 
* (1 + class_dir.count('.')) @@ -1221,7 +1248,7 @@ def write_core_class( if len(columns_entrypoint) > 0: for c in columns_entrypoint: name = c.new_name_converted - if name.endswith('_column'): + if name.endswith('_column_name'): tail_snip += "\n {0}=self._getattr_role('{0}', " \ "all_args),".format(name) elif name == "source" or c.name == "Source": @@ -1536,6 +1563,7 @@ def __init__(self, argument, inout): # dict self.default = argument.get('Default', Missing()) self.required = argument.get('Required', Missing()) self.aliases = argument.get('Aliases', Missing()) + self.pass_as = argument.get('PassAs', None) self.name_converted = convert_name(self.name) self.new_name_converted = convert_name( @@ -1545,15 +1573,9 @@ def __init__(self, argument, inout): # dict self.new_name) self.name_assignment = self.new_name_converted self.name_core_assignment = self.new_name_converted - # self.name_annotated = '{}: """{}"""'.format(self.name, self.type) self.name_annotated = '{}: {}'.format( self.new_name_converted, self.type_python) - # NOTE: the default values specified in the - # manifest.json for some inputs do not work. - if self.name in ('WeightColumn', 'GroupIdColumn', 'GroupColumn'): - self.default = None - def __str__(self): return self.name @@ -1596,7 +1618,7 @@ def get_body(self): "is_of_type=numbers.Real" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) if not isinstance(self.range, Missing): @@ -1627,7 +1649,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=bool" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1674,7 +1696,7 @@ def get_body(self): template += ", is_column=True" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1698,7 +1720,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=str" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) value_check = ", values={0}".format(str(self.type['Values'])) @@ -1729,7 +1751,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=list" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1771,7 +1793,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1799,7 +1821,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1827,7 +1849,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=dict" body = template.format( inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1863,7 +1885,7 @@ def get_body(self): template += ", is_column=True" body = template.format( 
inout=self.inout, - name=self.name, + name=self.pass_as or self.name, name_converted=self.name_converted, none_acceptable=not self.required) field_check = ", field_names={0}".format( diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index 518b863f..67951c74 100644 --- a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -97,7 +97,7 @@ "ShortName": null, "Inputs": [ { - "Name": "Model", + "Name": "Models", "Type": { "Kind": "Array", "ItemType": "PredictorModel" @@ -110,7 +110,7 @@ ], "Outputs": [ { - "Name": "OutputModel", + "Name": "OutputModels", "Type": { "Kind": "Array", "ItemType": "PredictorModel" @@ -191,8 +191,8 @@ "Desc": "Type of the items in the column", "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": "R4" }, { "Name": "Source", @@ -280,36 +280,18 @@ "Default": null }, { - "Name": "KeyRange", + "Name": "KeyCount", "Type": { "Kind": "Struct", "Fields": [ { - "Name": "Min", - "Type": "UInt", - "Desc": "First index in the range", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "Max", + "Name": "Count", "Type": "UInt", - "Desc": "Last index in the range", + "Desc": "Count of valid key values", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null - }, - { - "Name": "Contiguous", - "Type": "Bool", - "Desc": "Whether the key is contiguous", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true } ] }, @@ -334,42 +316,6 @@ "IsNullable": false, "Default": null }, - { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Use separate parsing threads?", - "Aliases": [ - "threads" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "HeaderFile", - "Type": "String", - "Desc": "File containing a header with feature names. If specified, header defined in the data file (header+) is ignored.", - "Aliases": [ - "hf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "MaxRows", - "Type": "Int", - "Desc": "Maximum number of rows to produce", - "Aliases": [ - "rows" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, { "Name": "AllowQuoting", "Type": "Bool", @@ -380,7 +326,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { "Name": "AllowSparse", @@ -392,7 +338,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { "Name": "InputSize", @@ -446,6 +392,42 @@ "SortOrder": 150.0, "IsNullable": false, "Default": false + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Use separate parsing threads?", + "Aliases": [ + "threads" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "HeaderFile", + "Type": "String", + "Desc": "File containing a header with feature names. 
If specified, header defined in the data file (header+) is ignored.", + "Aliases": [ + "hf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "MaxRows", + "Type": "Int", + "Desc": "Maximum number of rows to produce", + "Aliases": [ + "rows" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null } ] }, @@ -1329,7 +1311,7 @@ "Kind": "Enum", "Values": [ "SignatureBinaryClassifierTrainer", - "SignatureMultiClassClassifierTrainer", + "SignatureMulticlassClassificationTrainer", "SignatureRankerTrainer", "SignatureRegressorTrainer", "SignatureMultiOutputRegressorTrainer", @@ -1520,7 +1502,7 @@ "Kind": "Enum", "Values": [ "SignatureBinaryClassifierTrainer", - "SignatureMultiClassClassifierTrainer", + "SignatureMulticlassClassificationTrainer", "SignatureRankerTrainer", "SignatureRegressorTrainer", "SignatureMultiOutputRegressorTrainer", @@ -2115,7 +2097,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -2136,7 +2118,7 @@ "Default": true }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -2148,7 +2130,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -2157,7 +2139,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -2186,11 +2168,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -2355,7 +2336,7 @@ "Default": true }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -2367,7 +2348,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -2379,7 +2360,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -2388,7 +2369,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -2417,11 +2398,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -2695,7 +2675,7 @@ ] }, { - "Name": "Models.RankerEvaluator", + "Name": "Models.RankingEvaluator", "Desc": "Evaluates a ranking scored dataset.", "FriendlyName": null, "ShortName": null, @@ -3197,7 +3177,7 @@ "Kind": "Enum", "Values": [ "SignatureBinaryClassifierTrainer", - "SignatureMultiClassClassifierTrainer", + "SignatureMulticlassClassificationTrainer", "SignatureRankerTrainer", "SignatureRegressorTrainer", "SignatureMultiOutputRegressorTrainer", @@ -4186,7 +4166,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -4198,7 +4178,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -4236,11 
+4216,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -4307,11 +4286,12 @@ } }, { - "Name": "L2RegularizerWeight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 Regularization Weight", "Aliases": [ - "reg" + "reg", + "L2RegularizerWeight" ], "Required": false, "SortOrder": 50.0, @@ -4324,11 +4304,12 @@ } }, { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations", "Aliases": [ - "iter" + "iter", + "numIterations" ], "Required": false, "SortOrder": 50.0, @@ -4343,11 +4324,12 @@ } }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "initWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -4396,11 +4378,12 @@ "Default": null }, { - "Name": "DoLazyUpdates", + "Name": "LazyUpdate", "Type": "Bool", "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "lazy" + "lazy", + "DoLazyUpdates" ], "Required": false, "SortOrder": 150.0, @@ -4420,11 +4403,12 @@ "Default": 0.0 }, { - "Name": "RecencyGainMulti", + "Name": "RecencyGainMultiplicative", "Type": "Bool", "Desc": "Whether Recency Gain is multiplicative (vs. additive)", "Aliases": [ - "rgm" + "rgm", + "RecencyGainMulti" ], "Required": false, "SortOrder": 150.0, @@ -4485,18 +4469,6 @@ true ] } - }, - { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 } ], "Outputs": [ @@ -4555,7 +4527,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -4579,7 +4551,7 @@ "Default": null }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -4651,11 +4623,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -4757,7 +4728,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -4781,7 +4752,7 @@ "Default": null }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -4853,11 +4824,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -4959,7 +4929,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -4983,7 +4953,7 @@ "Default": null }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -5055,11 +5025,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -5128,7 +5097,7 @@ "ShortName": "ff", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": 
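Several of the manifest renames above keep the old spelling as an alias (for example `L2Regularization` retains `L2RegularizerWeight`, `NumberOfIterations` retains `numIterations`, `LazyUpdate` retains `DoLazyUpdates`), which lets older option names keep resolving. A small illustrative resolver, assuming dict-shaped entries like those in the diff (the function itself is not part of the repository):

    def matches_option(manifest_input, user_key):
        """True if user_key addresses this input by its name or by an alias."""
        names = {manifest_input['Name'], *manifest_input.get('Aliases', [])}
        return user_key in names

    l2 = {'Name': 'L2Regularization', 'Aliases': ['reg', 'L2RegularizerWeight']}
    assert matches_option(l2, 'L2RegularizerWeight')  # legacy name still works
    assert matches_option(l2, 'L2Regularization')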
"Total number of decision trees to create in the ensemble", "Aliases": [ @@ -5159,7 +5128,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -5178,7 +5147,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -5190,9 +5159,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -5210,7 +5179,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -5222,7 +5191,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -5231,10 +5200,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -5243,7 +5212,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -5272,11 +5241,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -5286,7 +5254,7 @@ "Default": "Auto" }, { - "Name": "MaxTreeOutput", + "Name": "MaximumOutputMagnitudePerTree", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -5321,9 +5289,9 @@ "Default": 1000000 }, { - "Name": "QuantileSampleCount", + "Name": "NumberOfQuantileSamples", "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Desc": "Number of labels to be sampled from each leaf to make the distribution", "Aliases": [ "qsc" ], @@ -5350,7 +5318,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -5362,7 +5330,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -5374,7 +5342,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -5446,7 +5414,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -5458,7 +5426,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -5470,9 +5438,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -5482,9 +5450,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -5525,7 +5493,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -5597,7 +5565,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -5633,7 +5601,7 @@ "Default": 1 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -5645,7 +5613,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -5705,18 +5673,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -5779,7 +5735,7 @@ "ShortName": "ffr", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -5810,7 +5766,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -5829,7 +5785,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -5841,9 +5797,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -5861,7 +5817,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -5873,7 +5829,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -5882,10 +5838,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": 
"GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -5894,7 +5850,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -5923,11 +5879,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -5946,9 +5901,9 @@ "Default": false }, { - "Name": "QuantileSampleCount", + "Name": "NumberOfQuantileSamples", "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Desc": "Number of labels to be sampled from each leaf to make the distribution", "Aliases": [ "qsc" ], @@ -5975,7 +5930,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -5987,7 +5942,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -5999,7 +5954,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -6071,7 +6026,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -6083,7 +6038,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -6095,9 +6050,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -6107,9 +6062,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -6150,7 +6105,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -6222,7 +6177,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -6258,7 +6213,7 @@ "Default": 1 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -6270,7 +6225,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -6330,18 +6285,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": 
"Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -6404,7 +6347,7 @@ "ShortName": "ftc", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -6435,7 +6378,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -6454,7 +6397,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -6466,9 +6409,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -6486,7 +6429,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -6498,7 +6441,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -6516,7 +6459,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -6525,10 +6468,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -6537,7 +6480,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -6566,11 +6509,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -6582,7 +6524,7 @@ { "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Desc": "Option for using derivatives optimized for unbalanced sets", "Aliases": [ "us" ], @@ -6594,7 +6536,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -6616,7 +6558,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -6628,7 +6570,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -6683,7 +6625,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -6798,7 +6740,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -6860,7 +6802,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in 
a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -6887,7 +6829,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -6899,7 +6841,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -6911,7 +6853,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -6983,7 +6925,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -6995,7 +6937,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -7007,9 +6949,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -7019,9 +6961,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -7062,7 +7004,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -7134,7 +7076,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -7170,7 +7112,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -7182,7 +7124,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -7242,18 +7184,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -7316,7 +7246,7 @@ "ShortName": "ftrank", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -7347,7 +7277,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of 
leaves in each regression tree", "Aliases": [ @@ -7366,7 +7296,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -7378,9 +7308,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -7398,7 +7328,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -7410,7 +7340,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -7428,7 +7358,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -7437,10 +7367,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -7449,7 +7379,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -7478,11 +7408,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -7493,18 +7422,27 @@ }, { "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", + "Type": { + "Kind": "Array", + "ItemType": "Float" + }, + "Desc": "Comma-separated list of gains associated to each relevance label.", "Aliases": [ "gains" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "0,3,7,15,31" + "Default": [ + 0.0, + 3.0, + 7.0, + 15.0, + 31.0 + ] }, { - "Name": "TrainDcg", + "Name": "UseDcg", "Type": "Bool", "Desc": "Train DCG instead of NDCG", "Aliases": [ @@ -7528,9 +7466,9 @@ "Default": "DescendingStablePessimistic" }, { - "Name": "LambdaMartMaxTruncation", + "Name": "NdcgTruncationLevel", "Type": "Int", - "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", + "Desc": "max-NDCG truncation to use in the LambdaMART algorithm", "Aliases": [ "n" ], @@ -7587,7 +7525,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -7609,7 +7547,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -7621,7 +7559,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -7791,7 +7729,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -7853,7 +7791,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for 
position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -7880,7 +7818,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -7892,7 +7830,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -7904,7 +7842,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -7976,7 +7914,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -7988,7 +7926,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -8000,9 +7938,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -8012,9 +7950,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -8055,7 +7993,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -8127,7 +8065,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -8163,7 +8101,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -8175,7 +8113,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -8235,18 +8173,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -8309,7 +8235,7 @@ "ShortName": "ftr", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -8340,7 +8266,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": 
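`CustomGains` for the ranking trainer changes above from a comma-separated string (old default "0,3,7,15,31") to an array of floats. A hypothetical migration helper for old-style values:

    def migrate_custom_gains(value):
        """Accept the old comma-separated string or the new list of floats."""
        if isinstance(value, str):
            return [float(v) for v in value.split(',') if v.strip()]
        return [float(v) for v in value]

    print(migrate_custom_gains('0,3,7,15,31'))    # -> [0.0, 3.0, 7.0, 15.0, 31.0]
    print(migrate_custom_gains([0.0, 3.0, 7.0]))  # already the new shape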
[ @@ -8359,7 +8285,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -8371,9 +8297,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -8391,7 +8317,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -8403,7 +8329,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -8421,7 +8347,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -8430,10 +8356,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -8442,7 +8368,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -8471,11 +8397,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -8487,7 +8412,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -8509,7 +8434,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -8521,7 +8446,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -8691,7 +8616,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -8753,7 +8678,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -8780,7 +8705,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -8792,7 +8717,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -8804,7 +8729,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -8876,7 +8801,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to 
consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -8888,7 +8813,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -8900,9 +8825,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -8912,9 +8837,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -8955,7 +8880,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -9027,7 +8952,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -9063,7 +8988,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -9075,7 +9000,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -9135,18 +9060,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -9209,7 +9122,7 @@ "ShortName": "fttweedie", "Inputs": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -9240,7 +9153,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -9259,7 +9172,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -9271,9 +9184,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -9291,7 +9204,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -9303,7 +9216,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -9321,7 +9234,7 @@ } }, { - "Name": "WeightColumn", + "Name": 
"ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -9330,10 +9243,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -9342,7 +9255,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -9371,11 +9284,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -9396,7 +9308,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -9418,7 +9330,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -9430,7 +9342,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -9485,7 +9397,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -9600,7 +9512,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -9662,7 +9574,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -9689,7 +9601,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -9701,7 +9613,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -9713,7 +9625,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -9785,7 +9697,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -9797,7 +9709,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -9809,9 +9721,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -9821,9 +9733,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -9864,7 +9776,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -9936,7 +9848,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -9972,7 +9884,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -9984,7 +9896,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -10044,18 +9956,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -10147,10 +10047,11 @@ "IsNullable": false }, { - "Name": "Iters", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of training iterations", "Aliases": [ + "iters", "iter" ], "Required": false, @@ -10164,7 +10065,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -10176,7 +10077,7 @@ "Default": "Features" }, { - "Name": "LatentDim", + "Name": "LatentDimension", "Type": "Int", "Desc": "Latent space dimension", "Aliases": [ @@ -10193,7 +10094,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -10222,6 +10123,18 @@ "IsLogScale": true } }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "LambdaLatent", "Type": "Float", @@ -10242,26 +10155,6 @@ }, { "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "Norm", "Type": "Bool", "Desc": "Whether to normalize the input 
vectors so that the concatenation of all fields' feature vectors is unit-length", "Aliases": [ @@ -10279,11 +10172,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -10292,6 +10184,21 @@ "IsNullable": false, "Default": "Auto" }, + { + "Name": "ExtraFeatureColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Extra columns to use for feature vectors. The i-th specified string denotes the column containing features form the (i+1)-th field. Note that the first field is specified by \"feat\" instead of \"exfeat\".", + "Aliases": [ + "exfeat" + ], + "Required": false, + "SortOrder": 7.0, + "IsNullable": false, + "Default": null + }, { "Name": "Shuffle", "Type": "Bool", @@ -10342,6 +10249,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -10357,7 +10265,7 @@ "ShortName": "gam", "Inputs": [ { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Total number of iterations over all features", "Aliases": [ @@ -10388,7 +10296,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -10400,7 +10308,7 @@ "Default": "Features" }, { - "Name": "MinDocuments", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of training instances required to form a partition", "Aliases": [ @@ -10420,7 +10328,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -10432,7 +10340,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -10450,7 +10358,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -10459,7 +10367,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -10488,11 +10396,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -10538,7 +10445,7 @@ "Default": 0 }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -10562,7 +10469,7 @@ "Default": null }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -10574,7 +10481,7 @@ "Default": 255 }, { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single output", "Aliases": [ @@ -10598,7 +10505,7 @@ "Default": 1 }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -10658,7 +10565,7 @@ "ShortName": "gamr", "Inputs": [ { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Total number of iterations over all features", "Aliases": [ @@ -10689,7 +10596,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -10701,7 +10608,7 @@ 
"Default": "Features" }, { - "Name": "MinDocuments", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of training instances required to form a partition", "Aliases": [ @@ -10721,7 +10628,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -10733,7 +10640,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -10751,7 +10658,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -10760,7 +10667,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -10789,11 +10696,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -10839,7 +10745,7 @@ "Default": 0 }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -10863,7 +10769,7 @@ "Default": null }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -10875,7 +10781,7 @@ "Default": 255 }, { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single output", "Aliases": [ @@ -10899,7 +10805,7 @@ "Default": 1 }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -10970,7 +10876,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -10982,7 +10888,7 @@ "Default": "Features" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -10991,7 +10897,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -11020,11 +10926,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -11052,7 +10957,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", "Aliases": [ @@ -11066,13 +10971,13 @@ "Default": null }, { - "Name": "InitAlgorithm", + "Name": "InitializationAlgorithm", "Type": { "Kind": "Enum", "Values": [ "KMeansPlusPlus", "Random", - "KMeansParallel" + "KMeansYinyang" ] }, "Desc": "Cluster initialization algorithm", @@ -11082,7 +10987,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "KMeansParallel" + "Default": "KMeansYinyang" }, { "Name": "OptTol", @@ -11097,11 +11002,12 @@ "Default": 1E-07 }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum number of iterations.", "Aliases": [ - "maxiter" + "maxiter", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -11144,7 +11050,7 @@ "ShortName": "LightGBM", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -11196,7 +11102,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -11215,7 +11121,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11236,7 +11142,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -11262,7 +11168,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -11274,7 +11180,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -11283,10 +11189,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -11295,7 +11201,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -11324,11 +11230,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -11338,23 +11243,11 @@ "Default": "Auto" }, { - "Name": "MaxBin", - "Type": "Int", - "Desc": "Max number of bucket bin for features.", - "Aliases": [ - "mb" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 255 - }, - { - "Name": "VerboseEval", + "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Verbose", + "Desc": "Use for binary classification when training data is not balanced.", "Aliases": [ - "v" + "us" ], "Required": false, "SortOrder": 150.0, @@ -11362,41 +11255,39 @@ "Default": false }, { - "Name": "Silent", - "Type": "Bool", - "Desc": "Printing running messages.", + "Name": "WeightOfPositiveExamples", + "Type": "Float", + "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. 
A typical value to consider: sum(negative cases) / sum(positive cases).", + "Aliases": [ + "ScalePosWeight" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1.0 }, { - "Name": "NThread", - "Type": "Int", - "Desc": "Number of parallel threads used to run LightGBM.", + "Name": "Sigmoid", + "Type": "Float", + "Desc": "Parameter for the sigmoid function.", "Aliases": [ - "nt" + "sigmoid" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0.5 }, { - "Name": "EvalMetric", + "Name": "EvaluationMetric", "Type": { "Kind": "Enum", "Values": [ - "DefaultMetric", - "Rmse", - "Mae", + "None", + "Default", "Logloss", "Error", - "Merror", - "Mlogloss", - "Auc", - "Ndcg", - "Map" + "AreaUnderCurve" ] }, "Desc": "Evaluation metrics.", @@ -11406,59 +11297,64 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "DefaultMetric" + "Default": "Logloss" }, { - "Name": "UseSoftmax", - "Type": "Bool", - "Desc": "Use softmax loss for the multi classification.", + "Name": "MaximumBinCountPerFeature", + "Type": "Int", + "Desc": "Maximum number of bucket bin for features.", + "Aliases": [ + "mb" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - true, - false - ] - } + "IsNullable": false, + "Default": 255 }, { - "Name": "EarlyStoppingRound", - "Type": "Int", - "Desc": "Rounds of early stopping, 0 will disable it.", + "Name": "Verbose", + "Type": "Bool", + "Desc": "Verbose", "Aliases": [ - "es" + "v" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": false }, { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", + "Name": "Silent", + "Type": "Bool", + "Desc": "Printing running messages.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "NumberOfThreads", + "Type": "Int", + "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ - "gains" + "nt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095" + "IsNullable": true, + "Default": null }, { - "Name": "Sigmoid", - "Type": "Float", - "Desc": "Parameter for the sigmoid function. 
Used only in LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in LightGbmRankingTrainer.", + "Name": "EarlyStoppingRound", + "Type": "Int", + "Desc": "Rounds of early stopping, 0 will disable it.", "Aliases": [ - "sigmoid" + "es" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.5 + "Default": 0 }, { "Name": "BatchSize", @@ -11470,7 +11366,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -11489,13 +11385,13 @@ } }, { - "Name": "UseMissing", + "Name": "HandleMissingValue", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing value or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -11505,9 +11401,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -11530,7 +11426,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -11555,7 +11451,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Lapalace smooth term in categorical feature spilt. Avoid the bias of small categories.", "Required": false, @@ -11575,7 +11471,7 @@ } }, { - "Name": "CatL2", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -11596,6 +11492,15 @@ ] } }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "Sets the random seed for LightGBM to use.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "ParallelTrainer", "Type": { @@ -11639,7 +11544,7 @@ "ShortName": "LightGBMMC", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -11691,7 +11596,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -11710,7 +11615,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -11731,7 +11636,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -11757,7 +11662,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -11769,7 +11674,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -11778,10 +11683,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -11790,7 +11695,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -11819,11 +11724,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache 
input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -11833,9 +11737,57 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "UseSoftmax", + "Type": "Bool", + "Desc": "Use softmax loss for the multi classification.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + true, + false + ] + } + }, + { + "Name": "Sigmoid", + "Type": "Float", + "Desc": "Parameter for the sigmoid function.", + "Aliases": [ + "sigmoid" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "EvaluationMetric", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "Default", + "Error", + "LogLoss" + ] + }, + "Desc": "Evaluation metrics.", + "Aliases": [ + "em" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "Error" + }, + { + "Name": "MaximumBinCountPerFeature", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bin for features.", "Aliases": [ "mb" ], @@ -11845,7 +11797,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -11866,7 +11818,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -11878,82 +11830,16 @@ "Default": null }, { - "Name": "EvalMetric", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultMetric", - "Rmse", - "Mae", - "Logloss", - "Error", - "Merror", - "Mlogloss", - "Auc", - "Ndcg", - "Map" - ] - }, - "Desc": "Evaluation metrics.", + "Name": "EarlyStoppingRound", + "Type": "Int", + "Desc": "Rounds of early stopping, 0 will disable it.", "Aliases": [ - "em" + "es" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "DefaultMetric" - }, - { - "Name": "UseSoftmax", - "Type": "Bool", - "Desc": "Use softmax loss for the multi classification.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - true, - false - ] - } - }, - { - "Name": "EarlyStoppingRound", - "Type": "Int", - "Desc": "Rounds of early stopping, 0 will disable it.", - "Aliases": [ - "es" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", - "Aliases": [ - "gains" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095" - }, - { - "Name": "Sigmoid", - "Type": "Float", - "Desc": "Parameter for the sigmoid function. 
Used only in LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in LightGbmRankingTrainer.", - "Aliases": [ - "sigmoid" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.5 + "Default": 0 }, { "Name": "BatchSize", @@ -11965,7 +11851,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -11984,13 +11870,13 @@ } }, { - "Name": "UseMissing", + "Name": "HandleMissingValue", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing value or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -12000,9 +11886,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -12025,7 +11911,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12050,7 +11936,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Lapalace smooth term in categorical feature spilt. Avoid the bias of small categories.", "Required": false, @@ -12070,7 +11956,7 @@ } }, { - "Name": "CatL2", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12091,6 +11977,15 @@ ] } }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "Sets the random seed for LightGBM to use.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "ParallelTrainer", "Type": { @@ -12134,7 +12029,7 @@ "ShortName": "LightGBMRank", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -12186,7 +12081,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -12205,7 +12100,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12226,7 +12121,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -12252,7 +12147,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -12264,7 +12159,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -12273,10 +12168,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -12285,7 +12180,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -12314,11 +12209,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ 
"cache" ], @@ -12328,9 +12222,69 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "CustomGains", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "An array of gains associated to each relevance label.", + "Aliases": [ + "gains" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": [ + 0, + 3, + 7, + 15, + 31, + 63, + 127, + 255, + 511, + 1023, + 2047, + 4095 + ] + }, + { + "Name": "Sigmoid", + "Type": "Float", + "Desc": "Parameter for the sigmoid function.", + "Aliases": [ + "sigmoid" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "EvaluationMetric", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "Default", + "MeanAveragedPrecision", + "NormalizedDiscountedCumulativeGain" + ] + }, + "Desc": "Evaluation metrics.", + "Aliases": [ + "em" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "NormalizedDiscountedCumulativeGain" + }, + { + "Name": "MaximumBinCountPerFeature", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bin for features.", "Aliases": [ "mb" ], @@ -12340,7 +12294,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -12361,7 +12315,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -12372,48 +12326,6 @@ "IsNullable": true, "Default": null }, - { - "Name": "EvalMetric", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultMetric", - "Rmse", - "Mae", - "Logloss", - "Error", - "Merror", - "Mlogloss", - "Auc", - "Ndcg", - "Map" - ] - }, - "Desc": "Evaluation metrics.", - "Aliases": [ - "em" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "DefaultMetric" - }, - { - "Name": "UseSoftmax", - "Type": "Bool", - "Desc": "Use softmax loss for the multi classification.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - true, - false - ] - } - }, { "Name": "EarlyStoppingRound", "Type": "Int", @@ -12426,30 +12338,6 @@ "IsNullable": false, "Default": 0 }, - { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", - "Aliases": [ - "gains" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095" - }, - { - "Name": "Sigmoid", - "Type": "Float", - "Desc": "Parameter for the sigmoid function. 
Used only in LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in LightGbmRankingTrainer.", - "Aliases": [ - "sigmoid" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.5 - }, { "Name": "BatchSize", "Type": "Int", @@ -12460,7 +12348,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -12479,13 +12367,13 @@ } }, { - "Name": "UseMissing", + "Name": "HandleMissingValue", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing value or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -12495,9 +12383,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -12520,7 +12408,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -12545,7 +12433,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Lapalace smooth term in categorical feature spilt. Avoid the bias of small categories.", "Required": false, @@ -12565,7 +12453,7 @@ } }, { - "Name": "CatL2", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -12586,6 +12474,15 @@ ] } }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "Sets the random seed for LightGBM to use.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "ParallelTrainer", "Type": { @@ -12629,7 +12526,7 @@ "ShortName": "LightGBMR", "Inputs": [ { - "Name": "NumBoostRound", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations.", "Aliases": [ @@ -12681,7 +12578,7 @@ } }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "Maximum leaves for trees.", "Aliases": [ @@ -12700,7 +12597,7 @@ } }, { - "Name": "MinDataPerLeaf", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", "Desc": "Minimum number of instances needed in a child.", "Aliases": [ @@ -12721,7 +12618,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -12747,7 +12644,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -12759,7 +12656,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -12768,10 +12665,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -12780,7 +12677,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -12809,11 +12706,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ 
"cache" ], @@ -12823,9 +12719,30 @@ "Default": "Auto" }, { - "Name": "MaxBin", + "Name": "EvaluationMetric", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "Default", + "MeanAbsoluteError", + "RootMeanSquaredError", + "MeanSquaredError" + ] + }, + "Desc": "Evaluation metrics.", + "Aliases": [ + "em" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "RootMeanSquaredError" + }, + { + "Name": "MaximumBinCountPerFeature", "Type": "Int", - "Desc": "Max number of bucket bin for features.", + "Desc": "Maximum number of bucket bin for features.", "Aliases": [ "mb" ], @@ -12835,7 +12752,7 @@ "Default": 255 }, { - "Name": "VerboseEval", + "Name": "Verbose", "Type": "Bool", "Desc": "Verbose", "Aliases": [ @@ -12856,7 +12773,7 @@ "Default": true }, { - "Name": "NThread", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of parallel threads used to run LightGBM.", "Aliases": [ @@ -12867,48 +12784,6 @@ "IsNullable": true, "Default": null }, - { - "Name": "EvalMetric", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultMetric", - "Rmse", - "Mae", - "Logloss", - "Error", - "Merror", - "Mlogloss", - "Auc", - "Ndcg", - "Map" - ] - }, - "Desc": "Evaluation metrics.", - "Aliases": [ - "em" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "DefaultMetric" - }, - { - "Name": "UseSoftmax", - "Type": "Bool", - "Desc": "Use softmax loss for the multi classification.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - true, - false - ] - } - }, { "Name": "EarlyStoppingRound", "Type": "Int", @@ -12921,30 +12796,6 @@ "IsNullable": false, "Default": 0 }, - { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", - "Aliases": [ - "gains" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095" - }, - { - "Name": "Sigmoid", - "Type": "Float", - "Desc": "Parameter for the sigmoid function. Used only in LightGbmBinaryTrainer, LightGbmMulticlassTrainer and in LightGbmRankingTrainer.", - "Aliases": [ - "sigmoid" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.5 - }, { "Name": "BatchSize", "Type": "Int", @@ -12955,7 +12806,7 @@ "Default": 1048576 }, { - "Name": "UseCat", + "Name": "UseCategoricalSplit", "Type": "Bool", "Desc": "Enable categorical split or not.", "Aliases": [ @@ -12974,13 +12825,13 @@ } }, { - "Name": "UseMissing", + "Name": "HandleMissingValue", "Type": "Bool", - "Desc": "Enable missing value auto infer or not.", + "Desc": "Enable special handling of missing value or not.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -12990,9 +12841,9 @@ } }, { - "Name": "MinDataPerGroup", + "Name": "MinimumExampleCountPerGroup", "Type": "Int", - "Desc": "Min number of instances per categorical group.", + "Desc": "Minimum number of instances per categorical group.", "Aliases": [ "mdpg" ], @@ -13015,7 +12866,7 @@ } }, { - "Name": "MaxCatThreshold", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Max number of categorical thresholds.", "Aliases": [ @@ -13040,7 +12891,7 @@ } }, { - "Name": "CatSmooth", + "Name": "CategoricalSmoothing", "Type": "Float", "Desc": "Lapalace smooth term in categorical feature spilt. 
Avoid the bias of small categories.", "Required": false, @@ -13060,7 +12911,7 @@ } }, { - "Name": "CatL2", + "Name": "L2CategoricalRegularization", "Type": "Float", "Desc": "L2 Regularization for categorical split.", "Required": false, @@ -13081,6 +12932,15 @@ ] } }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "Sets the random seed for LightGBM to use.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "ParallelTrainer", "Type": { @@ -13135,7 +12995,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -13147,7 +13007,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -13158,6 +13018,19 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight", + "WeightColumn" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -13185,11 +13058,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -13237,11 +13109,12 @@ } }, { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations", "Aliases": [ - "iter" + "iter", + "numIterations" ], "Required": false, "SortOrder": 50.0, @@ -13256,11 +13129,12 @@ } }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "initWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -13343,18 +13217,6 @@ ] } }, - { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 - }, { "Name": "BatchSize", "Type": "Int", @@ -13402,7 +13264,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -13414,7 +13276,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -13426,7 +13288,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -13435,7 +13297,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -13464,11 +13326,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -13478,11 +13339,12 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", + "Name": "ShowTrainingStatistics", "Type": "Bool", "Desc": "Show statistics of training examples.", "Aliases": [ - "stat" + "stat", + "ShowTrainingStats" ], "Required": false, "SortOrder": 50.0, @@ -13490,11 +13352,12 @@ "Default": false }, { - "Name": "L2Weight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization weight", "Aliases": [ - "l2" + 
"l2", + "L2Weight" ], "Required": false, "SortOrder": 50.0, @@ -13508,11 +13371,12 @@ } }, { - "Name": "L1Weight", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization weight", "Aliases": [ - "l1" + "l1", + "L1Weight" ], "Required": false, "SortOrder": 50.0, @@ -13526,11 +13390,12 @@ } }, { - "Name": "OptTol", + "Name": "OptimizationTolerance", "Type": "Float", "Desc": "Tolerance parameter for optimization convergence. Low = slower, more accurate", "Aliases": [ - "ot" + "ot", + "OptTol" ], "Required": false, "SortOrder": 50.0, @@ -13545,11 +13410,12 @@ } }, { - "Name": "MemorySize", + "Name": "HistorySize", "Type": "Int", "Desc": "Memory size for L-BFGS. Low=faster, less accurate", "Aliases": [ - "m" + "m", + "MemorySize" ], "Required": false, "SortOrder": 50.0, @@ -13577,11 +13443,12 @@ "Default": false }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "InitWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -13595,11 +13462,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum iterations.", "Aliases": [ - "maxiter" + "maxiter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -13612,11 +13481,12 @@ } }, { - "Name": "SgdInitializationTolerance", + "Name": "StochasticGradientDescentInitilaizationTolerance", "Type": "Float", "Desc": "Run SGD to initialize LR weights, converging to this tolerance", "Aliases": [ - "sgd" + "sgd", + "SgdInitializationTolerance" ], "Required": false, "SortOrder": 150.0, @@ -13648,11 +13518,12 @@ "Default": true }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of threads", "Aliases": [ - "nt" + "nt", + "NumThreads" ], "Required": false, "SortOrder": 150.0, @@ -13698,7 +13569,7 @@ }, { "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.", + "Desc": "Maximum entrypy classification is a method in statistics used to predict the probabilities of parallel events. 
The model predicts the probabilities of parallel events by fitting data to a softmax function.", "FriendlyName": "Multi-class Logistic Regression", "ShortName": "mlr", "Inputs": [ @@ -13714,7 +13585,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -13726,7 +13597,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -13738,7 +13609,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -13747,7 +13618,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -13776,11 +13647,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -13790,11 +13660,12 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", + "Name": "ShowTrainingStatistics", "Type": "Bool", "Desc": "Show statistics of training examples.", "Aliases": [ - "stat" + "stat", + "ShowTrainingStats" ], "Required": false, "SortOrder": 50.0, @@ -13802,11 +13673,12 @@ "Default": false }, { - "Name": "L2Weight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization weight", "Aliases": [ - "l2" + "l2", + "L2Weight" ], "Required": false, "SortOrder": 50.0, @@ -13820,11 +13692,12 @@ } }, { - "Name": "L1Weight", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization weight", "Aliases": [ - "l1" + "l1", + "L1Weight" ], "Required": false, "SortOrder": 50.0, @@ -13838,11 +13711,12 @@ } }, { - "Name": "OptTol", + "Name": "OptimizationTolerance", "Type": "Float", "Desc": "Tolerance parameter for optimization convergence. Low = slower, more accurate", "Aliases": [ - "ot" + "ot", + "OptTol" ], "Required": false, "SortOrder": 50.0, @@ -13857,11 +13731,12 @@ } }, { - "Name": "MemorySize", + "Name": "HistorySize", "Type": "Int", "Desc": "Memory size for L-BFGS. 
Low=faster, less accurate", "Aliases": [ - "m" + "m", + "MemorySize" ], "Required": false, "SortOrder": 50.0, @@ -13889,11 +13764,12 @@ "Default": false }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "InitWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -13907,11 +13783,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum iterations.", "Aliases": [ - "maxiter" + "maxiter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -13924,11 +13802,12 @@ } }, { - "Name": "SgdInitializationTolerance", + "Name": "StochasticGradientDescentInitilaizationTolerance", "Type": "Float", "Desc": "Run SGD to initialize LR weights, converging to this tolerance", "Aliases": [ - "sgd" + "sgd", + "SgdInitializationTolerance" ], "Required": false, "SortOrder": 150.0, @@ -13960,11 +13839,12 @@ "Default": true }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of threads", "Aliases": [ - "nt" + "nt", + "NumThreads" ], "Required": false, "SortOrder": 150.0, @@ -14010,7 +13890,7 @@ }, { "Name": "Trainers.NaiveBayesClassifier", - "Desc": "Train a MultiClassNaiveBayesTrainer.", + "Desc": "Train a MulticlassNaiveBayesTrainer.", "FriendlyName": "Multiclass Naive Bayes", "ShortName": "MNB", "Inputs": [ @@ -14026,7 +13906,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -14038,7 +13918,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -14076,11 +13956,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -14124,7 +14003,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -14136,7 +14015,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -14174,11 +14053,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -14245,11 +14123,12 @@ } }, { - "Name": "L2RegularizerWeight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 Regularization Weight", "Aliases": [ - "reg" + "reg", + "L2RegularizerWeight" ], "Required": false, "SortOrder": 50.0, @@ -14262,11 +14141,12 @@ } }, { - "Name": "NumIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Number of iterations", "Aliases": [ - "iter" + "iter", + "numIterations" ], "Required": false, "SortOrder": 50.0, @@ -14281,11 +14161,12 @@ } }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "initWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -14311,11 +14192,12 @@ "Default": null }, { - "Name": "DoLazyUpdates", + "Name": "LazyUpdate", "Type": "Bool", "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "lazy" + 
"lazy", + "DoLazyUpdates" ], "Required": false, "SortOrder": 150.0, @@ -14335,11 +14217,12 @@ "Default": 0.0 }, { - "Name": "RecencyGainMulti", + "Name": "RecencyGainMultiplicative", "Type": "Bool", "Desc": "Whether Recency Gain is multiplicative (vs. additive)", "Aliases": [ - "rgm" + "rgm", + "RecencyGainMulti" ], "Required": false, "SortOrder": 150.0, @@ -14400,18 +14283,6 @@ true ] } - }, - { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 } ], "Outputs": [ @@ -14448,7 +14319,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -14460,7 +14331,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -14472,7 +14343,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -14481,7 +14352,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -14510,11 +14381,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -14524,7 +14394,7 @@ "Default": "Auto" }, { - "Name": "L2Weight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization weight", "Aliases": [ @@ -14544,7 +14414,7 @@ } }, { - "Name": "PerParameterSignificance", + "Name": "CalculateStatistics", "Type": "Bool", "Desc": "Whether to calculate per parameter significance statistics", "Aliases": [ @@ -14591,7 +14461,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -14603,7 +14473,7 @@ "Default": "Features" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -14612,7 +14482,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -14641,11 +14511,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -14758,7 +14627,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -14770,7 +14639,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -14782,7 +14651,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -14791,7 +14660,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -14820,11 +14689,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", 
"Aliases": [ "cache" ], @@ -14834,11 +14702,12 @@ "Default": "Auto" }, { - "Name": "L2Weight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization weight", "Aliases": [ - "l2" + "l2", + "L2Weight" ], "Required": false, "SortOrder": 50.0, @@ -14852,11 +14721,12 @@ } }, { - "Name": "L1Weight", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization weight", "Aliases": [ - "l1" + "l1", + "L1Weight" ], "Required": false, "SortOrder": 50.0, @@ -14870,11 +14740,12 @@ } }, { - "Name": "OptTol", + "Name": "OptimizationTolerance", "Type": "Float", "Desc": "Tolerance parameter for optimization convergence. Low = slower, more accurate", "Aliases": [ - "ot" + "ot", + "OptTol" ], "Required": false, "SortOrder": 50.0, @@ -14889,11 +14760,12 @@ } }, { - "Name": "MemorySize", + "Name": "HistorySize", "Type": "Int", "Desc": "Memory size for L-BFGS. Low=faster, less accurate", "Aliases": [ - "m" + "m", + "MemorySize" ], "Required": false, "SortOrder": 50.0, @@ -14921,11 +14793,12 @@ "Default": false }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Type": "Float", "Desc": "Init weights diameter", "Aliases": [ - "initwts" + "initwts", + "InitWtsDiameter" ], "Required": false, "SortOrder": 140.0, @@ -14939,11 +14812,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum iterations.", "Aliases": [ - "maxiter" + "maxiter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -14956,11 +14831,12 @@ } }, { - "Name": "SgdInitializationTolerance", + "Name": "StochasticGradientDescentInitilaizationTolerance", "Type": "Float", "Desc": "Run SGD to initialize LR weights, converging to this tolerance", "Aliases": [ - "sgd" + "sgd", + "SgdInitializationTolerance" ], "Required": false, "SortOrder": 150.0, @@ -14992,11 +14868,12 @@ "Default": true }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Number of threads", "Aliases": [ - "nt" + "nt", + "NumThreads" ], "Required": false, "SortOrder": 150.0, @@ -15047,11 +14924,12 @@ "ShortName": "SDCA", "Inputs": [ { - "Name": "L2Const", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", "Aliases": [ - "l2" + "l2", + "L2Const" ], "Required": false, "SortOrder": 1.0, @@ -15105,7 +14983,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -15117,7 +14995,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -15128,6 +15006,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15155,11 +15045,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -15186,31 +15075,20 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", "Aliases": [ "nt", "t", - "threads" + "threads", + "NumThreads" ], "Required": false, "SortOrder": 50.0, "IsNullable": true, "Default": null }, - { - "Name": "PositiveInstanceWeight", - "Type": "Float", - "Desc": "Apply weight to the positive class, for imbalanced data", - "Aliases": [ - "piw" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 - }, { "Name": "Calibrator", "Type": { @@ -15234,6 +15112,18 @@ "IsNullable": false, "Default": 1000000 }, + { + "Name": "PositiveInstanceWeight", + "Type": "Float", + "Desc": "Apply weight to the positive class, for imbalanced data", + "Aliases": [ + "piw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, { "Name": "ConvergenceTolerance", "Type": "Float", @@ -15256,11 +15146,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", "Aliases": [ - "iter" + "iter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -15296,11 +15188,12 @@ } }, { - "Name": "CheckFrequency", + "Name": "ConvergenceCheckFrequency", "Type": "Int", "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "checkFreq" + "checkFreq", + "CheckFrequency" ], "Required": false, "SortOrder": 150.0, @@ -15337,6 +15230,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -15352,11 +15246,12 @@ "ShortName": "sasdcamc", "Inputs": [ { - "Name": "L2Const", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", "Aliases": [ - "l2" + "l2", + "L2Const" ], "Required": false, "SortOrder": 1.0, @@ -15410,7 +15305,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -15422,7 +15317,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -15433,6 +15328,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15460,11 +15367,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -15491,13 +15397,14 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ "nt", "t", - "threads" + "threads", + "NumThreads" ], "Required": false, "SortOrder": 50.0, @@ -15526,11 +15433,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", "Aliases": [ - "iter" + "iter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -15566,11 +15475,12 @@ } }, { - "Name": "CheckFrequency", + "Name": "ConvergenceCheckFrequency", "Type": "Int", "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "checkFreq" + "checkFreq", + "CheckFrequency" ], "Required": false, "SortOrder": 150.0, @@ -15607,6 +15517,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -15622,11 +15533,12 @@ "ShortName": "sasdcar", "Inputs": [ { - "Name": "L2Const", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", "Aliases": [ - "l2" + "l2", + "L2Const" ], "Required": false, "SortOrder": 1.0, @@ -15680,7 +15592,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -15692,7 +15604,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -15703,6 +15615,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, { "Name": "NormalizeFeatures", "Type": { @@ -15730,11 +15654,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -15761,13 +15684,14 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ "nt", "t", - "threads" + "threads", + "NumThreads" ], "Required": false, "SortOrder": 50.0, @@ -15796,11 +15720,13 @@ } }, { - "Name": "MaxIterations", + "Name": "MaximumNumberOfIterations", "Type": "Int", "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", "Aliases": [ - "iter" + "iter", + "MaxIterations", + "NumberOfIterations" ], "Required": false, "SortOrder": 150.0, @@ -15836,11 +15762,12 @@ } }, { - "Name": "CheckFrequency", + "Name": "ConvergenceCheckFrequency", "Type": "Int", "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "checkFreq" + "checkFreq", + "CheckFrequency" ], "Required": false, "SortOrder": 150.0, @@ -15877,6 +15804,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -15903,7 +15831,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -15915,7 +15843,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -15927,7 +15855,7 @@ "Default": "Label" }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -15936,7 +15864,7 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { "Name": "NormalizeFeatures", @@ -15965,11 +15893,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -15996,11 +15923,12 @@ } }, { - "Name": "L2Weight", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 Regularization constant", "Aliases": [ - "l2" + "l2", + "L2Weight" ], "Required": false, "SortOrder": 50.0, @@ -16018,19 +15946,43 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.", "Aliases": [ "nt", "t", - "threads" + "threads", + "NumThreads" ], "Required": false, "SortOrder": 50.0, "IsNullable": true, "Default": null }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + }, { "Name": "ConvergenceTolerance", "Type": "Float", @@ -16053,11 +16005,12 @@ } }, { - "Name": "MaxIterations", + "Name": "NumberOfIterations", "Type": "Int", "Desc": "Maximum number of iterations; set to 1 to simulate online learning.", "Aliases": [ - "iter" + "iter", + "MaxIterations" ], "Required": false, "SortOrder": 150.0, @@ -16074,12 +16027,13 @@ } }, { - "Name": "InitLearningRate", + "Name": "InitialLearningRate", "Type": "Float", "Desc": "Initial learning rate (only used by SGD)", "Aliases": [ "ilr", - "lr" + "lr", + "InitLearningRate" ], "Required": false, "SortOrder": 150.0, @@ -16128,29 +16082,6 @@ "SortOrder": 150.0, "IsNullable": true, "Default": null - }, - { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } - }, - { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 } ], "Outputs": [ @@ -16188,7 +16119,7 @@ "IsNullable": false }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -16200,7 +16131,7 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -16238,11 +16169,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -16706,9 +16636,9 @@ "Kind": "Enum", "Values": [ "Bag", - "Ind", + "Indicator", "Key", - "Bin" + "Binary" ] }, "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", @@ -16721,7 +16651,7 @@ "Default": null }, { - "Name": "HashBits", + "Name": "NumberOfBits", "Type": "Int", "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.", "Aliases": [ @@ -16754,7 +16684,7 @@ "Default": null }, { - "Name": "InvertHash", + "Name": "MaximumNumberOfInverts", "Type": "Int", "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", "Aliases": [ @@ -16792,7 +16722,7 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:hashBits:src)", + "Desc": "New column definition(s) (optional form: name:numberOfBits:src)", "Aliases": [ "col" ], @@ -16809,7 +16739,7 @@ "IsNullable": false }, { - "Name": "HashBits", + "Name": "NumberOfBits", "Type": "Int", "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.", "Aliases": [ @@ -16826,9 +16756,9 @@ "Kind": "Enum", "Values": [ "Bag", - "Ind", + "Indicator", "Key", - "Bin" + "Binary" ] }, "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", @@ -16862,7 +16792,7 @@ "Default": true }, { - "Name": "InvertHash", + "Name": "MaximumNumberOfInverts", "Type": "Int", "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", "Aliases": [ @@ -16912,9 +16842,9 @@ "Kind": "Enum", "Values": [ "Bag", - "Ind", + "Indicator", "Key", - "Bin" + "Binary" ] }, "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", @@ -16955,8 +16885,8 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", @@ -17023,7 +16953,7 @@ { "Name": "MaxNumTerms", "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Desc": "Maximum number of keys to keep per column when auto-training", "Aliases": [ "max" ], @@ -17038,9 +16968,9 @@ "Kind": "Enum", "Values": [ "Bag", - "Ind", + "Indicator", "Key", - "Bin" + "Binary" ] }, "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", @@ -17050,7 +16980,7 @@ "Required": false, "SortOrder": 102.0, "IsNullable": false, - "Default": "Ind" + "Default": "Indicator" }, { "Name": "Term", @@ -17069,15 +16999,15 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", "Required": false, "SortOrder": 113.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "ByOccurrence" }, { "Name": "TextKeyValues", @@ -17785,62 +17715,21 @@ "maxtrain" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000000 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ] - }, - { - "Name": "Transforms.DataCache", - "Desc": "Caches using the specified cache option.", - "FriendlyName": "Cache Data", - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Memory", - "Disk" - ] - }, - "Desc": "Caching strategy", - "Required": true, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Memory" + "Default": 1000000000 } ], "Outputs": [ { "Name": "OutputData", "Type": "DataView", - "Desc": "Dataset" + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" } ], "InputKind": [ @@ -17970,8 +17859,8 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", @@ -18039,7 +17928,7 @@ { "Name": "MaxNumTerms", "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Desc": "Maximum number of keys to keep per column when auto-training", "Aliases": [ "max" ], @@ -18065,15 +17954,15 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", "Required": false, "SortOrder": 113.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "ByOccurrence" }, { "Name": "TextKeyValues", @@ -18336,7 +18225,8 @@ "IsNullable": false }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", + "PassAs": "LabelColumn", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -18545,7 +18435,7 @@ "Default": null }, { - "Name": "HashBits", + "Name": "NumberOfBits", "Type": "Int", "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", "Aliases": [ @@ -18621,7 +18511,7 @@ "IsNullable": false }, { - "Name": "HashBits", + "Name": "NumberOfBits", "Type": "Int", "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", "Aliases": [ @@ -18908,12 +18798,28 @@ "Default": null }, { - "Name": "InterleaveArgb", + "Name": "Order", + "Type": { + "Kind": "Enum", + "Values": [ + "ARGB", + "ARBG", + "ABRG", + "ABGR", + "AGRB", + "AGBR" + ] + }, + "Desc": "Order of channels", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Interleave", "Type": "Bool", - "Desc": "Whether to separate each channel or interleave in ARGB order", - "Aliases": [ - "interleave" - ], + "Desc": "Whether to separate each channel or interleave in specified order", "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -19041,12 +18947,28 @@ "Default": true }, { - "Name": "InterleaveArgb", + "Name": "Order", + "Type": { + "Kind": "Enum", + "Values": [ + "ARGB", + "ARBG", + "ABRG", + "ABGR", + "AGRB", + "AGBR" + ] + }, + "Desc": "Order of colors.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "ARGB" + }, + { + "Name": "Interleave", "Type": "Bool", - "Desc": "Whether to separate each channel or interleave in ARGB order", - "Aliases": [ - "interleave" - ], + "Desc": "Whether to separate each channel or interleave in specified order", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -19145,7 +19067,8 @@ "Kind": "Enum", "Values": [ "IsoPad", - "IsoCrop" + "IsoCrop", + "Fill" ] }, "Desc": "Resizing method", @@ -19251,7 +19174,8 @@ "Kind": "Enum", "Values": [ "IsoPad", - "IsoCrop" + "IsoCrop", + "Fill" ] }, "Desc": "Resizing method", @@ -20083,14 +20007,14 @@ "Kind": "Struct", "Fields": [ { - "Name": "NormKind", + "Name": "Norm", "Type": { "Kind": "Enum", "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" + "L2", + "StandardDeviation", + "L1", + "Infinity" ] }, "Desc": "The norm to use to normalize each sample", @@ -20147,14 +20071,14 @@ "IsNullable": false }, { - "Name": "NormKind", + "Name": "Norm", "Type": { "Kind": "Enum", "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" + "L2", + "StandardDeviation", + "L1", + "Infinity" ] }, "Desc": "The norm to use to normalize each sample", @@ -20164,7 +20088,7 @@ "Required": false, "SortOrder": 1.0, "IsNullable": false, - "Default": "L2Norm" + "Default": "L2" }, { "Name": "Data", @@ -21044,7 +20968,7 @@ }, { "Name": "Transforms.NGramTranslator", - "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.", + "Desc": "Produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. 
It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.", "FriendlyName": "NGram Transform", "ShortName": "NgramTransform", "Inputs": [ @@ -21058,7 +20982,7 @@ { "Name": "NgramLength", "Type": "Int", - "Desc": "Maximum ngram length", + "Desc": "Maximum n-gram length", "Aliases": [ "ngram" ], @@ -21070,7 +20994,7 @@ { "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", + "Desc": "Whether to include all n-gram lengths up to NgramLength or only NgramLength", "Aliases": [ "all" ], @@ -21082,7 +21006,7 @@ { "Name": "SkipLength", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "Maximum number of tokens to skip when constructing an n-gram", "Aliases": [ "skips" ], @@ -21097,7 +21021,7 @@ "Kind": "Array", "ItemType": "Int" }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Desc": "Maximum number of n-grams to store in the dictionary", "Aliases": [ "max" ], @@ -21169,7 +21093,7 @@ { "Name": "NgramLength", "Type": "Int", - "Desc": "Maximum ngram length", + "Desc": "Maximum n-gram length", "Aliases": [ "ngram" ], @@ -21181,7 +21105,7 @@ { "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength", + "Desc": "Whether to store all n-gram lengths up to ngramLength, or only ngramLength", "Aliases": [ "all" ], @@ -21193,7 +21117,7 @@ { "Name": "SkipLength", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "Maximum number of tokens to skip when constructing an n-gram", "Aliases": [ "skips" ], @@ -21208,7 +21132,7 @@ "Kind": "Array", "ItemType": "Int" }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Desc": "Maximum number of n-grams to store in the dictionary", "Aliases": [ "max" ], @@ -21454,7 +21378,7 @@ "IsNullable": false }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "The name of the weight column", "Aliases": [ @@ -21922,7 +21846,7 @@ { "Name": "Transforms.ScoreColumnSelector", "Desc": "Selects only the last score columns and the extra columns specified in the arguments.", - "FriendlyName": "Choose Columns By Index", + "FriendlyName": "Choose Columns By Indices", "ShortName": null, "Inputs": [ { @@ -22286,6 +22210,15 @@ "SortOrder": 15.0, "IsNullable": false, "Default": false + }, + { + "Name": "AddBatchDimensionInputs", + "Type": "Bool", + "Desc": "Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3].", + "Required": false, + "SortOrder": 16.0, + "IsNullable": false, + "Default": false } ], "Outputs": [ @@ -22309,7 +22242,7 @@ }, { "Name": "Transforms.TextFeaturizer", - "Desc": "A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.", + "Desc": "A transform that turns a collection of text documents into numerical feature vectors. 
The feature vectors are normalized counts of (word and/or character) n-grams in a given tokenized text.", "FriendlyName": "Text Transform", "ShortName": "Text", "Inputs": [ @@ -22387,16 +22320,19 @@ "Default": "English" }, { - "Name": "UsePredefinedStopWordRemover", - "Type": "Bool", - "Desc": "Use stop remover or not.", + "Name": "StopWordsRemover", + "Type": { + "Kind": "Component", + "ComponentKind": "StopWordsRemover" + }, + "Desc": "Stopwords remover.", "Aliases": [ "remover" ], "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": false + "Default": null }, { "Name": "TextCase", @@ -22454,9 +22390,9 @@ "Default": true }, { - "Name": "OutputTokens", - "Type": "Bool", - "Desc": "Whether to output the transformed text tokens as an additional column.", + "Name": "OutputTokensColumnName", + "Type": "String", + "Desc": "Column containing the transformed text tokens.", "Aliases": [ "tokens", "showtext", @@ -22465,7 +22401,7 @@ "Required": false, "SortOrder": 9.0, "IsNullable": false, - "Default": false + "Default": null }, { "Name": "Dictionary", @@ -22489,15 +22425,15 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "ByOccurrence" }, { "Name": "DropUnknowns", @@ -22576,7 +22512,7 @@ "None", "L1", "L2", - "LInf" + "Infinity" ] }, "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.", @@ -22650,8 +22586,8 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", @@ -22719,7 +22655,7 @@ { "Name": "MaxNumTerms", "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Desc": "Maximum number of keys to keep per column when auto-training", "Aliases": [ "max" ], @@ -22745,15 +22681,15 @@ "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "ByOccurrence", + "ByValue" ] }, "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", "Required": false, "SortOrder": 113.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "ByOccurrence" }, { "Name": "TextKeyValues", @@ -22996,12 +22932,28 @@ "Default": null }, { - "Name": "InterleaveArgb", + "Name": "Order", + "Type": { + "Kind": "Enum", + "Values": [ + "ARGB", + "ARBG", + "ABRG", + "ABGR", + "AGRB", + "AGBR" + ] + }, + "Desc": "Order of channels", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Interleave", "Type": "Bool", - "Desc": "Whether to separate each channel or interleave in ARGB order", - "Aliases": [ - "interleave" - ], + "Desc": "Whether to separate each channel or interleave in specified order", "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -23049,6 +23001,42 @@ "IsNullable": true, "Default": null }, + { + "Name": "DefaultAlpha", + "Type": "Int", + "Desc": "Default value for alpha channel. Will be used if ContainsAlpha set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "DefaultRed", + "Type": "Int", + "Desc": "Default value for red channel. Will be used if ContainsRed set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "DefaultGreen", + "Type": "Int", + "Desc": "Default value for green channel. Will be used if ContainsGreen set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "DefaultBlue", + "Type": "Int", + "Desc": "Default value for blue channel. Will be used if ContainsGreen set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "Name", "Type": "String", @@ -23141,12 +23129,28 @@ "Default": true }, { - "Name": "InterleaveArgb", + "Name": "Order", + "Type": { + "Kind": "Enum", + "Values": [ + "ARGB", + "ARBG", + "ABRG", + "ABGR", + "AGRB", + "AGBR" + ] + }, + "Desc": "Order of colors.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "ARGB" + }, + { + "Name": "Interleave", "Type": "Bool", - "Desc": "Whether to separate each channel or interleave in ARGB order", - "Aliases": [ - "interleave" - ], + "Desc": "Whether to separate each channel or interleave in specified order", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23182,8 +23186,8 @@ "Desc": "Offset (pre-scale)", "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0.0 }, { "Name": "Scale", @@ -23191,8 +23195,44 @@ "Desc": "Scale factor", "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "DefaultAlpha", + "Type": "Int", + "Desc": "Default value for alpha channel. Will be used if ContainsAlpha set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 255 + }, + { + "Name": "DefaultRed", + "Type": "Int", + "Desc": "Default value for red channel. Will be used if ContainsRed set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "DefaultGreen", + "Type": "Int", + "Desc": "Default value for green channel. 
Will be used if ContainsGreen set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "DefaultBlue", + "Type": "Int", + "Desc": "Default value for blue channel. Will be used if ContainsBlue set to false", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 } ], "Outputs": [ @@ -23276,7 +23316,7 @@ "GloVeTwitter100D", "GloVeTwitter200D", "FastTextWikipedia300D", - "Sswe" + "SentimentSpecificWordEmbedding" ] }, "Desc": "Pre-trained model used to create the vocabulary", @@ -23286,7 +23326,7 @@ "Required": false, "SortOrder": 1.0, "IsNullable": true, - "Default": "Sswe" + "Default": "SentimentSpecificWordEmbedding" }, { "Name": "Data", @@ -23443,9 +23483,9 @@ "FriendlyName": "Tree Dropout Tree Booster", "Settings": [ { - "Name": "DropRate", + "Name": "TreeDropFraction", "Type": "Float", - "Desc": "Drop ratio for trees. Range:(0,1).", + "Desc": "The drop ratio for trees. Range:(0,1).", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23456,9 +23496,9 @@ } }, { - "Name": "MaxDrop", + "Name": "MaximumNumberOfDroppedTreesPerRound", "Type": "Int", - "Desc": "Max number of dropped tree in a boosting round.", + "Desc": "Maximum number of dropped trees in a boosting round.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23469,9 +23509,9 @@ } }, { - "Name": "SkipDrop", + "Name": "SkipDropFraction", "Type": "Float", - "Desc": "Probability for not perform dropping in a boosting round.", + "Desc": "Probability for not dropping in a boosting round.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23500,19 +23540,7 @@ "Default": false }, { - "Name": "UnbalancedSets", - "Type": "Bool", - "Desc": "Use for binary classification when classes are not balanced.", - "Aliases": [ - "us" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", "Required": false, @@ -23524,7 +23552,7 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", "Required": false, @@ -23537,7 +23565,7 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", "Required": false, @@ -23549,9 +23577,9 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. 
Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at every N iterations.This must be set with Subsample as this specifies the amount to subsample.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23562,7 +23590,7 @@ } }, { - "Name": "Subsample", + "Name": "SubsampleFraction", "Type": "Float", "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].", "Required": false, @@ -23591,7 +23619,7 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ @@ -23614,7 +23642,7 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ @@ -23635,15 +23663,6 @@ 1.0 ] } - }, - { - "Name": "ScalePosWeight", - "Type": "Float", - "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 } ] }, @@ -23653,19 +23672,7 @@ "FriendlyName": "Tree Booster", "Settings": [ { - "Name": "UnbalancedSets", - "Type": "Bool", - "Desc": "Use for binary classification when classes are not balanced.", - "Aliases": [ - "us" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", "Required": false, @@ -23677,7 +23684,7 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", "Required": false, @@ -23690,7 +23697,7 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", "Required": false, @@ -23702,9 +23709,9 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at every N iterations.This must be set with Subsample as this specifies the amount to subsample.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23715,7 +23722,7 @@ } }, { - "Name": "Subsample", + "Name": "SubsampleFraction", "Type": "Float", "Desc": "Subsample ratio of the training instance. 
Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].", "Required": false, @@ -23744,7 +23751,7 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ @@ -23767,7 +23774,7 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ @@ -23788,15 +23795,6 @@ 1.0 ] } - }, - { - "Name": "ScalePosWeight", - "Type": "Float", - "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 } ] }, @@ -23832,19 +23830,7 @@ } }, { - "Name": "UnbalancedSets", - "Type": "Bool", - "Desc": "Use for binary classification when classes are not balanced.", - "Aliases": [ - "us" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MinSplitGain", + "Name": "MinimumSplitGain", "Type": "Float", "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.", "Required": false, @@ -23856,7 +23842,7 @@ } }, { - "Name": "MaxDepth", + "Name": "MaximumTreeDepth", "Type": "Int", "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.", "Required": false, @@ -23869,7 +23855,7 @@ } }, { - "Name": "MinChildWeight", + "Name": "MinimumChildWeight", "Type": "Float", "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.", "Required": false, @@ -23881,9 +23867,9 @@ } }, { - "Name": "SubsampleFreq", + "Name": "SubsampleFrequency", "Type": "Int", - "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.", + "Desc": "Subsample frequency for bagging. 0 means no subsample. Specifies the frequency at which the bagging occurs, where if this is set to N, the subsampling will happen at every N iterations.This must be set with Subsample as this specifies the amount to subsample.", "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -23894,7 +23880,7 @@ } }, { - "Name": "Subsample", + "Name": "SubsampleFraction", "Type": "Float", "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. 
Range: (0,1].", "Required": false, @@ -23923,7 +23909,7 @@ } }, { - "Name": "RegLambda", + "Name": "L2Regularization", "Type": "Float", "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.", "Aliases": [ @@ -23946,7 +23932,7 @@ } }, { - "Name": "RegAlpha", + "Name": "L1Regularization", "Type": "Float", "Desc": "L1 regularization term on weights, increase this value will make model more conservative.", "Aliases": [ @@ -23967,15 +23953,6 @@ 1.0 ] } - }, - { - "Name": "ScalePosWeight", - "Type": "Float", - "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 } ] } @@ -24973,7 +24950,7 @@ "FriendlyName": "FastTree (Boosted Trees) Classification", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -25004,7 +24981,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -25023,7 +25000,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -25035,9 +25012,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -25055,7 +25032,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -25067,7 +25044,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -25085,7 +25062,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -25094,10 +25071,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -25106,7 +25083,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -25135,11 +25112,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -25151,7 +25127,7 @@ { "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Desc": "Option for using derivatives optimized for unbalanced sets", "Aliases": [ "us" ], @@ -25163,7 +25139,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -25185,7 +25161,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": 
[ @@ -25197,7 +25173,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -25252,7 +25228,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -25367,7 +25343,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -25429,7 +25405,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -25456,7 +25432,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -25468,7 +25444,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -25480,7 +25456,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -25552,7 +25528,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -25564,7 +25540,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -25576,9 +25552,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -25588,9 +25564,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -25631,7 +25607,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -25703,7 +25679,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -25739,7 +25715,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -25751,7 +25727,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each 
split", "Aliases": [ @@ -25811,18 +25787,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -25867,7 +25831,7 @@ "FriendlyName": "FastTree (Boosted Trees) Ranking", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -25898,7 +25862,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -25917,7 +25881,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -25929,9 +25893,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -25949,7 +25913,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -25961,7 +25925,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -25979,7 +25943,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -25988,10 +25952,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -26000,7 +25964,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -26029,11 +25993,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -26044,18 +26007,27 @@ }, { "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", + "Type": { + "Kind": "Array", + "ItemType": "Float" + }, + "Desc": "Comma-separated list of gains associated to each relevance label.", "Aliases": [ "gains" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "0,3,7,15,31" + "Default": [ + 0.0, + 3.0, + 7.0, + 15.0, + 31.0 + ] }, { - "Name": "TrainDcg", + "Name": "UseDcg", "Type": "Bool", "Desc": "Train DCG instead of NDCG", "Aliases": [ @@ -26079,9 +26051,9 @@ "Default": "DescendingStablePessimistic" }, { - "Name": "LambdaMartMaxTruncation", + "Name": "NdcgTruncationLevel", "Type": "Int", - "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", + "Desc": "max-NDCG truncation to use in the LambdaMART algorithm", "Aliases": [ "n" ], @@ -26138,7 +26110,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ 
-26160,7 +26132,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -26172,7 +26144,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -26342,7 +26314,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -26404,7 +26376,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -26431,7 +26403,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -26443,7 +26415,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -26455,7 +26427,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -26527,7 +26499,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -26539,7 +26511,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -26551,9 +26523,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -26563,9 +26535,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -26606,7 +26578,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -26678,7 +26650,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -26714,7 +26686,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -26726,7 +26698,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -26786,18 +26758,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -26842,7 +26802,7 @@ "FriendlyName": "FastTree (Boosted Trees) Regression", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -26873,7 +26833,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -26892,7 +26852,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -26904,9 +26864,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -26924,7 +26884,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -26936,7 +26896,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -26954,7 +26914,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Type": 
"String", "Desc": "Column to use for example weight", "Aliases": [ @@ -26963,10 +26923,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -26975,7 +26935,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -27004,11 +26964,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -27020,7 +26979,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -27042,7 +27001,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -27054,7 +27013,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -27224,7 +27183,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -27286,7 +27245,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -27313,7 +27272,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -27325,7 +27284,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -27337,7 +27296,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -27409,7 +27368,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -27421,7 +27380,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -27433,9 +27392,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -27445,9 +27404,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -27488,7 +27447,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -27560,7 +27519,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -27596,7 +27555,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -27608,7 +27567,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -27668,18 +27627,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -27724,7 +27671,7 @@ "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", "Settings": [ { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Type": "Int", "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ @@ -27755,7 +27702,7 @@ "IsNullable": false }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Type": "Int", "Desc": "The max number of leaves in each regression tree", "Aliases": [ @@ -27774,7 +27721,7 @@ } }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Type": "String", "Desc": "Column to use for features", "Aliases": [ @@ -27786,9 +27733,9 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "MinimumExampleCountPerLeaf", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "The minimal number of examples allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ "mil" ], @@ -27806,7 +27753,7 @@ } }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Type": "String", "Desc": "Column to use for labels", "Aliases": [ @@ -27818,7 +27765,7 @@ "Default": "Label" }, { - "Name": "LearningRates", + "Name": "LearningRate", "Type": "Float", "Desc": "The learning rate", "Aliases": [ @@ -27836,7 +27783,7 @@ } }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", 
"Type": "String", "Desc": "Column to use for example weight", "Aliases": [ @@ -27845,10 +27792,10 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": null }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Type": "String", "Desc": "Column to use for example groupId", "Aliases": [ @@ -27857,7 +27804,7 @@ "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": null }, { "Name": "NormalizeFeatures", @@ -27886,11 +27833,10 @@ "Values": [ "Auto", "Memory", - "Disk", "None" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Whether trainer should cache input training data", "Aliases": [ "cache" ], @@ -27911,7 +27857,7 @@ { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Use best regression step trees?", + "Desc": "Option for using best regression step trees", "Aliases": [ "bsr" ], @@ -27933,7 +27879,7 @@ "Default": false }, { - "Name": "NumPostBracketSteps", + "Name": "MaximumNumberOfLineSearchSteps", "Type": "Int", "Desc": "Number of post-bracket line search steps", "Aliases": [ @@ -27945,7 +27891,7 @@ "Default": 0 }, { - "Name": "MinStepSize", + "Name": "MinimumStepSize", "Type": "Float", "Desc": "Minimum line search step size", "Aliases": [ @@ -28000,7 +27946,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -28115,7 +28061,7 @@ "Default": false }, { - "Name": "MaxTreeOutput", + "Name": "MaximumTreeOutput", "Type": "Float", "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ @@ -28177,7 +28123,7 @@ { "Name": "PositionDiscountFreeform", "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Desc": "The discount freeform which specifies the per position discounts of examples in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ "pdff" ], @@ -28204,7 +28150,7 @@ } }, { - "Name": "NumThreads", + "Name": "NumberOfThreads", "Type": "Int", "Desc": "The number of threads to use", "Aliases": [ @@ -28216,7 +28162,7 @@ "Default": null }, { - "Name": "RngSeed", + "Name": "Seed", "Type": "Int", "Desc": "The seed of the random number generator", "Aliases": [ @@ -28228,7 +28174,7 @@ "Default": 123 }, { - "Name": "FeatureSelectSeed", + "Name": "FeatureSelectionSeed", "Type": "Int", "Desc": "The seed of the active feature selection", "Aliases": [ @@ -28300,7 +28246,7 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaximumCategoricalGroupCountPerNode", "Type": "Int", "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ @@ -28312,7 +28258,7 @@ "Default": 64 }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "MaximumCategoricalSplitPointCount", "Type": "Int", "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ @@ -28324,9 +28270,9 @@ "Default": 64 }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "MinimumExampleFractionForCategoricalSplit", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Minimum categorical example percentage in a bin to consider for a split.", "Aliases": [ "mdop" ], @@ -28336,9 +28282,9 @@ "Default": 0.001 }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinimumExamplesForCategoricalSplit", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Minimum categorical example count in a bin to consider for a split.", "Aliases": [ "mdo" ], @@ -28379,7 +28325,7 @@ "Default": "None" }, { - "Name": "MaxBins", + "Name": "MaximumBinCountPerFeature", "Type": "Int", "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ @@ -28451,7 +28397,7 @@ "Default": 0.0 }, { - "Name": "ExecutionTimes", + "Name": "ExecutionTime", "Type": "Bool", "Desc": "Print execution time breakdown to stdout", "Aliases": [ @@ -28487,7 +28433,7 @@ "Default": 0 }, { - "Name": "BaggingTrainFraction", + "Name": "BaggingExampleFraction", "Type": "Float", "Desc": "Percentage of training examples used in each bag", "Aliases": [ @@ -28499,7 +28445,7 @@ "Default": 0.7 }, { - "Name": "SplitFraction", + "Name": "FeatureFractionPerSplit", "Type": "Float", "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ @@ -28559,18 +28505,6 @@ "IsNullable": false, "Default": false }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, { "Name": "PrintTestGraph", "Type": "Bool", @@ -28638,7 +28572,7 @@ { "Name": "SkipLength", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "Maximum number of tokens to skip when constructing an n-gram", "Aliases": [ "skips" ], @@ -28650,7 +28584,7 @@ { "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", + "Desc": "Whether to include all n-gram lengths up to NgramLength or only NgramLength", "Aliases": [ "all" ], @@ -28665,7 +28599,7 @@ "Kind": "Array", "ItemType": "Int" }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Desc": "Maximum number of n-grams to store in the dictionary", "Aliases": [ "max" ], @@ -28704,7 +28638,7 @@ ], "Settings": [ { - "Name": "HashBits", + "Name": "NumberOfBits", "Type": "Int", "Desc": "Number of bits to hash into. 
Must be between 1 and 30, inclusive.", "Aliases": [ @@ -28730,7 +28664,7 @@ { "Name": "SkipLength", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "Maximum number of tokens to skip when constructing an n-gram", "Aliases": [ "skips" ], @@ -28742,7 +28676,7 @@ { "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to include all ngram lengths up to ngramLength or only ngramLength", + "Desc": "Whether to include all n-gram lengths up to ngramLength or only ngramLength", "Aliases": [ "all" ], @@ -28773,7 +28707,7 @@ "Default": true }, { - "Name": "InvertHash", + "Name": "MaximumNumberOfInverts", "Type": "Int", "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", "Aliases": [ diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index 786dac97..c19aad98 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -47,7 +47,7 @@ "NewName": "GainConfLevel" }, { - "Name": "InitWtsDiameter", + "Name": "InitialWeightsDiameter", "Desc": "Sets the initial weights diameter that specifies the range from which values are drawn for the initial weights. These weights are initialized randomly from within this range. For example, if the diameter is specified to be ``d``, then the weights are uniformly distributed between ``-d/2`` and ``d/2``. The default value is ``0``, which specifies that all the weights are set to zero." }, { @@ -55,8 +55,7 @@ "NewName": "L2Weight" }, { - "Name": "LearningRates", - "NewName": "LearningRate", + "Name": "LearningRate", "Desc": "Determines the size of the step taken in the direction of the gradient in each step of the learning process. This determines how fast or slow the learner converges on the optimal solution. If the step size is too big, you might overshoot the optimal solution. If the step size is too small, training takes longer to converge to the best solution." }, { @@ -67,13 +66,16 @@ "Name": "MaxBins", "NewName": "NumBins" }, + { + "Name": "HistorySize", + "Desc": "Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store for the computation of the next step. Must be greater than or equal to ``1``" + }, { "Name": "MemorySize", "Desc": "Memory size for L-BFGS. Lower=faster, less accurate. The technique used for optimization here is L-BFGS, which uses only a limited amount of memory to compute the next step direction. This parameter indicates the number of past positions and gradients to store for the computation of the next step. Must be greater than or equal to ``1``" }, { - "Name": "MinDocumentsInLeafs", - "NewName": "MinSplit", + "Name": "MinimumExampleCountPerLeaf", "Desc": "Minimum number of training instances required to form a leaf. That is, the minimal number of documents allowed in a leaf of regression tree, out of the sub-sampled data. A 'split' means that features in each level of the tree (node) are randomly divided." }, { @@ -82,15 +84,15 @@ "Desc": "If ``Auto``, the choice to normalize depends on the preference declared by the algorithm. This is the default choice. If ``No``, no normalization is performed. If ``Yes``, normalization always performed. 
If ``Warn``, if normalization is needed by the algorithm, a warning message is displayed but normalization is not performed. If normalization is performed, a ``MaxMin`` normalizer is used. This normalizer preserves sparsity by mapping zero to zero." }, { - "Name": "NumLeaves", + "Name": "NumberOfLeaves", "Desc": "The maximum number of leaves (terminal nodes) that can be created in any tree. Higher values potentially increase the size of the tree and get better precision, but risk overfitting and requiring longer training times." }, { "Name": "NumThreads", - "NewName": "TrainThreads" + "NewName": "NumberOfThreads" }, { - "Name": "NumTrees", + "Name": "NumberOfTrees", "Desc": "Specifies the total number of decision trees to create in the ensemble. By creating more decision trees, you can potentially get better coverage, but the training time increases." }, { @@ -127,19 +129,19 @@ "Hidden": true }, { - "Name": "FeatureColumn", + "Name": "FeatureColumnName", "Hidden": true }, { - "Name": "LabelColumn", + "Name": "LabelColumnName", "Hidden": true }, { - "Name": "WeightColumn", + "Name": "ExampleWeightColumnName", "Hidden": true }, { - "Name": "GroupIdColumn", + "Name": "RowGroupColumnName", "Hidden": true }, { @@ -298,8 +300,14 @@ "NewName": "FactorizationMachineBinaryClassifier", "Module": "decomposition", "Type": "Classifier", - "Predict_Proba" : true, - "Decision_Function" : true + "Predict_Proba": true, + "Decision_Function": true, + "Inputs": [ + { + "Name": "NormalizeFeatures", + "Desc": "Whether to normalize the input vectors so that the concatenation of all fields' feature vectors is unit-length" + } + ] }, { "Name": "Trainers.FastForestBinaryClassifier", @@ -307,23 +315,13 @@ "Module": "ensemble", "Type": "Classifier", "Predict_Proba" : true, - "Decision_Function" : true, - "Inputs": [{ - "Name": "MinDocsPercentageForCategoricalSplit", - "NewName": "min_docs_percentage_split" - } - ] + "Decision_Function" : true }, { "Name": "Trainers.FastForestRegressor", "NewName": "FastForestRegressor", "Module": "ensemble", - "Type": "Regressor", - "Inputs": [{ - "Name": "MinDocsPercentageForCategoricalSplit", - "NewName": "min_docs_percentage_split" - } - ] + "Type": "Regressor" }, { "Name": "Trainers.FastTreeBinaryClassifier", @@ -333,10 +331,6 @@ "Predict_Proba" : true, "Decision_Function" : true, "Inputs": [{ - "Name": "MinDocsPercentageForCategoricalSplit", - "NewName": "min_docs_percentage_split" - }, - { "Name": "BestStepRankingRegressionTrees", "NewName": "best_step_trees" } @@ -348,10 +342,6 @@ "Module": "ensemble", "Type": "Regressor", "Inputs": [{ - "Name": "MinDocsPercentageForCategoricalSplit", - "NewName": "min_docs_percentage_split" - }, - { "Name": "BestStepRankingRegressionTrees", "NewName": "best_step_trees" } @@ -366,7 +356,7 @@ "Decision_Function" : true, "Inputs": [ { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Default": "float('inf')" } ] @@ -378,7 +368,7 @@ "Type": "Regressor", "Inputs": [ { - "Name": "MaxOutput", + "Name": "MaximumTreeOutput", "Default": "float('inf')" } ] @@ -618,7 +608,13 @@ "Name": "Transforms.RowRangeFilter", "NewName": "RangeFilter", "Module": "preprocessing.filter", - "Type": "Transform" + "Type": "Transform", + "Inputs": [ + { + "Name": "Min", + "Default": -1 + } + ] }, { "Name": "Transforms.RowSkipFilter", @@ -696,10 +692,6 @@ "Module": "ensemble", "Type": "Regressor", "Inputs": [{ - "Name": "MinDocsPercentageForCategoricalSplit", - "NewName": "min_docs_percentage_split" - }, - { "Name": "BestStepRankingRegressionTrees", "NewName": 
"best_step_trees" } diff --git a/version.txt b/version.txt index bcaffe19..afaf360d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.7.0 \ No newline at end of file +1.0.0 \ No newline at end of file