diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 1395c998..26e5a84d 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -17,6 +17,7 @@ using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers.LightGbm; using Microsoft.ML.Transforms; +using Microsoft.ML.TimeSeries; namespace Microsoft.MachineLearning.DotNetBridge { @@ -328,6 +329,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd //env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(ForecastExtensions).Assembly); using (var ch = host.Start("Executing")) { diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 92365878..fab49e2e 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -41,5 +41,6 @@ + diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index 7491fac8..e75aa8f3 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -20,6 +20,7 @@ + diff --git a/src/python/docs/sphinx/make.bat b/src/python/docs/sphinx/make.bat index e79eca32..50b8b4ee 100644 --- a/src/python/docs/sphinx/make.bat +++ b/src/python/docs/sphinx/make.bat @@ -9,11 +9,13 @@ REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=%PYTHONINTERPRETER% -m sphinx ) + +:: Todo: Fix the issue here, the installtion guide is not showing correctly set SOURCEDIR=. set BUILDDIR=%~dp0_build set SPHINXPROJ=microsoftml -if "%1" == "" goto html: +if "%1" == "" goto html: set format=%1 goto next: @@ -52,4 +54,4 @@ goto end: @echo An issue happened. Check %BUILDDIR%\%format% is not here. 
:end -popd +popd \ No newline at end of file diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 48076cc7..d4b65307 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -88,6 +88,9 @@ + + + @@ -109,6 +112,7 @@ + @@ -134,6 +138,9 @@ + + + @@ -159,6 +166,7 @@ + @@ -181,7 +189,6 @@ - @@ -219,6 +226,11 @@ + + + + + @@ -569,6 +581,16 @@ + + + + + + + + + + @@ -741,6 +763,7 @@ + @@ -762,6 +785,7 @@ + @@ -778,6 +802,7 @@ + diff --git a/src/python/nimbusml/examples/IidChangePointDetector.py b/src/python/nimbusml/examples/IidChangePointDetector.py new file mode 100644 index 00000000..d9b4fdb1 --- /dev/null +++ b/src/python/nimbusml/examples/IidChangePointDetector.py @@ -0,0 +1,38 @@ +############################################################################### +# IidChangePointDetector +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import IidChangePointDetector + +# data input (as a FileDataStream) +path = get_dataset('timeseries').as_filepath() + +data = FileDataStream.read_csv(path) +print(data.head()) +# t1 t2 t3 +# 0 0.01 0.01 0.0100 +# 1 0.02 0.02 0.0200 +# 2 0.03 0.03 0.0200 +# 3 0.03 0.03 0.0250 +# 4 0.03 0.03 0.0005 + +# define the training pipeline +pipeline = Pipeline([ + IidChangePointDetector(columns={'t2_cp': 't2'}, change_history_length=4) +]) + +result = pipeline.fit_transform(data) +print(result) + +# t1 t2 t3 t2_cp.Alert t2_cp.Raw Score t2_cp.P-Value Score t2_cp.Martingale Score +# 0 0.01 0.01 0.0100 0.0 0.01 5.000000e-01 1.212573e-03 +# 1 0.02 0.02 0.0200 0.0 0.02 4.960106e-01 1.221347e-03 +# 2 0.03 0.03 0.0200 0.0 0.03 1.139087e-02 3.672914e-02 +# 3 0.03 0.03 0.0250 0.0 0.03 2.058296e-01 8.164447e-02 +# 4 0.03 0.03 0.0005 0.0 0.03 2.804577e-01 1.373786e-01 +# 5 0.03 0.05 0.0100 1.0 0.05 1.448886e-06 1.315014e+04 +# 6 0.05 0.07 0.0500 0.0 0.07 2.616611e-03 4.941587e+04 +# 7 0.07 0.09 0.0900 0.0 0.09 3.053187e-02 2.752614e+05 +# 8 0.09 99.00 99.0000 0.0 99.00 1.000000e-08 1.389396e+12 +# 9 1.10 0.10 0.1000 1.0 0.10 3.778296e-01 1.854344e+07 + diff --git a/src/python/nimbusml/examples/IidSpikeDetector.py b/src/python/nimbusml/examples/IidSpikeDetector.py new file mode 100644 index 00000000..f21e138e --- /dev/null +++ b/src/python/nimbusml/examples/IidSpikeDetector.py @@ -0,0 +1,37 @@ +############################################################################### +# IidSpikeDetector +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import IidSpikeDetector + +# data input (as a FileDataStream) +path = get_dataset('timeseries').as_filepath() + +data = FileDataStream.read_csv(path) +print(data.head()) +# t1 t2 t3 +# 0 0.01 0.01 0.0100 +# 1 0.02 0.02 0.0200 +# 2 0.03 0.03 0.0200 +# 3 0.03 0.03 0.0250 +# 4 0.03 0.03 0.0005 + +# define the training pipeline +pipeline = Pipeline([ + IidSpikeDetector(columns={'t2_spikes': 't2'}, pvalue_history_length=5) +]) + +result = pipeline.fit_transform(data) +print(result) +# t1 t2 t3 t2_spikes.Alert t2_spikes.Raw Score t2_spikes.P-Value Score +# 0 0.01 0.01 0.0100 0.0 0.01 5.000000e-01 +# 1 0.02 0.02 0.0200 0.0 0.02 4.960106e-01 +# 2 0.03 0.03 0.0200 0.0 0.03 1.139087e-02 +# 3 0.03 0.03 0.0250 0.0 0.03 2.058296e-01 +# 4 0.03 0.03 0.0005 0.0 0.03 2.804577e-01 +# 5 0.03 0.05 0.0100 1.0 0.05 3.743552e-03 +# 6 0.05 0.07 0.0500 1.0 0.07 4.136079e-03 +# 7 0.07 0.09 0.0900 0.0 0.09 2.242496e-02 +# 8 0.09 99.00 99.0000 1.0 99.00 1.000000e-08 +# 9 1.10 0.10 0.1000 0.0 0.10 
4.015681e-01 + diff --git a/src/python/nimbusml/examples/SsaChangePointDetector.py b/src/python/nimbusml/examples/SsaChangePointDetector.py new file mode 100644 index 00000000..2f002135 --- /dev/null +++ b/src/python/nimbusml/examples/SsaChangePointDetector.py @@ -0,0 +1,40 @@ +############################################################################### +# SsaChangePointDetector +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import SsaChangePointDetector + +# data input (as a FileDataStream) +path = get_dataset('timeseries').as_filepath() + +data = FileDataStream.read_csv(path) +print(data.head()) +# t1 t2 t3 +# 0 0.01 0.01 0.0100 +# 1 0.02 0.02 0.0200 +# 2 0.03 0.03 0.0200 +# 3 0.03 0.03 0.0250 +# 4 0.03 0.03 0.0005 + +# define the training pipeline +pipeline = Pipeline([ + SsaChangePointDetector(columns={'t2_cp': 't2'}, + change_history_length=4, + training_window_size=8, + seasonal_window_size=3) +]) + +result = pipeline.fit_transform(data) +print(result) + +# t1 t2 t3 t2_cp.Alert t2_cp.Raw Score t2_cp.P-Value Score t2_cp.Martingale Score +# 0 0.01 0.01 0.0100 0.0 -0.111334 5.000000e-01 0.001213 +# 1 0.02 0.02 0.0200 0.0 -0.076755 4.862075e-01 0.001243 +# 2 0.03 0.03 0.0200 0.0 -0.034871 3.856320e-03 0.099119 +# 3 0.03 0.03 0.0250 0.0 -0.012559 8.617091e-02 0.482400 +# 4 0.03 0.03 0.0005 0.0 -0.015723 2.252377e-01 0.988788 +# 5 0.03 0.05 0.0100 0.0 -0.001133 1.767711e-01 2.457946 +# 6 0.05 0.07 0.0500 0.0 0.006265 9.170460e-02 0.141898 +# 7 0.07 0.09 0.0900 0.0 0.002383 2.701134e-01 0.050747 +# 8 0.09 99.00 99.0000 1.0 98.879520 1.000000e-08 210274.372059 +# 9 1.10 0.10 0.1000 0.0 -57.817568 6.635692e-02 507877.454862 diff --git a/src/python/nimbusml/examples/SsaSpikeDetector.py b/src/python/nimbusml/examples/SsaSpikeDetector.py new file mode 100644 index 00000000..299c4475 --- /dev/null +++ b/src/python/nimbusml/examples/SsaSpikeDetector.py @@ -0,0 +1,40 @@ +############################################################################### +# SsaSpikeDetector +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import SsaSpikeDetector + +# data input (as a FileDataStream) +path = get_dataset('timeseries').as_filepath() + +data = FileDataStream.read_csv(path) +print(data.head()) +# t1 t2 t3 +# 0 0.01 0.01 0.0100 +# 1 0.02 0.02 0.0200 +# 2 0.03 0.03 0.0200 +# 3 0.03 0.03 0.0250 +# 4 0.03 0.03 0.0005 + +# define the training pipeline +pipeline = Pipeline([ + SsaSpikeDetector(columns={'t2_spikes': 't2'}, + pvalue_history_length=4, + training_window_size=8, + seasonal_window_size=3) +]) + +result = pipeline.fit_transform(data) +print(result) + +# t1 t2 t3 t2_spikes.Alert t2_spikes.Raw Score t2_spikes.P-Value Score +# 0 0.01 0.01 0.0100 0.0 -0.111334 5.000000e-01 +# 1 0.02 0.02 0.0200 0.0 -0.076755 4.862075e-01 +# 2 0.03 0.03 0.0200 0.0 -0.034871 3.856320e-03 +# 3 0.03 0.03 0.0250 0.0 -0.012559 8.617091e-02 +# 4 0.03 0.03 0.0005 0.0 -0.015723 2.252377e-01 +# 5 0.03 0.05 0.0100 0.0 -0.001133 1.767711e-01 +# 6 0.05 0.07 0.0500 0.0 0.006265 9.170460e-02 +# 7 0.07 0.09 0.0900 0.0 0.002383 2.701134e-01 +# 8 0.09 99.00 99.0000 1.0 98.879520 1.000000e-08 +# 9 1.10 0.10 0.1000 0.0 -57.817568 6.635692e-02 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/IidChangePointDetector_df.py b/src/python/nimbusml/examples/examples_from_dataframe/IidChangePointDetector_df.py new file mode 100644 index 00000000..00d19531 --- /dev/null +++ 
b/src/python/nimbusml/examples/examples_from_dataframe/IidChangePointDetector_df.py @@ -0,0 +1,34 @@ +############################################################################### +# IidChangePointDetector +import pandas as pd +from nimbusml.timeseries import IidChangePointDetector + +# Create a sample series with a change +input_data = [5, 5, 5, 5, 5, 5, 5, 5] +input_data.extend([7, 7, 7, 7, 7, 7, 7, 7]) + +X_train = pd.Series(input_data, name="ts") + +cpd = IidChangePointDetector(confidence=95, change_history_length=4) << {'result': 'ts'} +data = cpd.fit_transform(X_train) + +print(data) + +# ts result.Alert result.Raw Score result.P-Value Score result.Martingale Score +# 0 5 0.0 5.0 5.000000e-01 0.001213 +# 1 5 0.0 5.0 5.000000e-01 0.001213 +# 2 5 0.0 5.0 5.000000e-01 0.001213 +# 3 5 0.0 5.0 5.000000e-01 0.001213 +# 4 5 0.0 5.0 5.000000e-01 0.001213 +# 5 5 0.0 5.0 5.000000e-01 0.001213 +# 6 5 0.0 5.0 5.000000e-01 0.001213 +# 7 5 0.0 5.0 5.000000e-01 0.001213 +# 8 7 1.0 7.0 1.000000e-08 10298.666376 <-- alert is on, predicted changepoint +# 9 7 0.0 7.0 1.328455e-01 33950.164799 +# 10 7 0.0 7.0 2.613750e-01 60866.342063 +# 11 7 0.0 7.0 3.776152e-01 78362.038772 +# 12 7 0.0 7.0 5.000000e-01 0.009226 +# 13 7 0.0 7.0 5.000000e-01 0.002799 +# 14 7 0.0 7.0 5.000000e-01 0.001561 +# 15 7 0.0 7.0 5.000000e-01 0.001213 + diff --git a/src/python/nimbusml/examples/examples_from_dataframe/IidSpikeDetector_df.py b/src/python/nimbusml/examples/examples_from_dataframe/IidSpikeDetector_df.py new file mode 100644 index 00000000..93ab346d --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/IidSpikeDetector_df.py @@ -0,0 +1,26 @@ +############################################################################### +# IidSpikeDetector +import pandas as pd +from nimbusml.timeseries import IidSpikeDetector + +X_train = pd.Series([5, 5, 5, 5, 5, 10, 5, 5, 5, 5, 5], name="ts") + +isd = IidSpikeDetector(confidence=95, pvalue_history_length=2.5) << {'result': 'ts'} + +isd.fit(X_train, verbose=1) +data = isd.transform(X_train) + +print(data) + +# ts result.Alert result.Raw Score result.P-Value Score +# 0 5.0 0.0 5.0 5.000000e-01 +# 1 5.0 0.0 5.0 5.000000e-01 +# 2 5.0 0.0 5.0 5.000000e-01 +# 3 5.0 0.0 5.0 5.000000e-01 +# 4 5.0 0.0 5.0 5.000000e-01 +# 5 10.0 1.0 10.0 1.000000e-08 <-- alert is on, predicted spike +# 6 5.0 0.0 5.0 2.613750e-01 +# 7 5.0 0.0 5.0 2.613750e-01 +# 8 5.0 0.0 5.0 5.000000e-01 +# 9 5.0 0.0 5.0 5.000000e-01 +# 10 5.0 0.0 5.0 5.000000e-01 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/SsaChangePointDetector_df.py b/src/python/nimbusml/examples/examples_from_dataframe/SsaChangePointDetector_df.py new file mode 100644 index 00000000..9bea570e --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/SsaChangePointDetector_df.py @@ -0,0 +1,77 @@ +############################################################################### +# SsaChangePointDetector +import numpy as np +import pandas as pd +from nimbusml.timeseries import SsaChangePointDetector + +# This example creates a time series (list of data with the +# i-th element corresponding to the i-th time slot). +# The estimator is applied to identify points where data distribution changed. +# This estimator can account for temporal seasonality in the data. 
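+# Note: the SSA-based detectors require the seasonal window to satisfy +# 2 < seasonal_window_size < training_window_size / 2; the settings used +# below (training_window_size = 15, seasonal_window_size = 6) satisfy this.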
+ +# Generate sample series data with a recurring +# pattern and a spike within the pattern +seasonality_size = 5 +seasonal_data = np.arange(seasonality_size) + +data = np.tile(seasonal_data, 3) +data = np.append(data, [0, 100, 200, 300, 400]) # change distribution + +X_train = pd.Series(data, name="ts") + +# X_train looks like this +# 0 0 +# 1 1 +# 2 2 +# 3 3 +# 4 4 +# 5 0 +# 6 1 +# 7 2 +# 8 3 +# 9 4 +# 10 0 +# 11 1 +# 12 2 +# 13 3 +# 14 4 +# 15 0 +# 16 100 +# 17 200 +# 18 300 +# 19 400 + +training_seasons = 3 +training_size = seasonality_size * training_seasons + +cpd = SsaChangePointDetector(confidence=95, + change_history_length=8, + training_window_size=training_size, + seasonal_window_size=seasonality_size + 1) << {'result': 'ts'} + +cpd.fit(X_train, verbose=1) +data = cpd.transform(X_train) + +print(data) + +# ts result.Alert result.Raw Score result.P-Value Score result.Martingale Score +# 0 0 0.0 -2.531824 5.000000e-01 1.470334e-06 +# 1 1 0.0 -0.008832 5.818072e-03 8.094459e-05 +# 2 2 0.0 0.763040 1.374071e-01 2.588526e-04 +# 3 3 0.0 0.693811 2.797713e-01 4.365186e-04 +# 4 4 0.0 1.442079 1.838294e-01 1.074242e-03 +# 5 0 0.0 -1.844414 1.707238e-01 2.825599e-03 +# 6 1 0.0 0.219578 4.364025e-01 3.193633e-03 +# 7 2 0.0 0.201708 4.505472e-01 3.507451e-03 +# 8 3 0.0 0.157089 4.684456e-01 3.719387e-03 +# 9 4 0.0 1.329494 1.773046e-01 1.717610e-04 +# 10 0 0.0 -1.792391 7.353794e-02 3.014897e-04 +# 11 1 0.0 0.161634 4.999295e-01 1.788041e-04 +# 12 2 0.0 0.092626 4.953789e-01 7.326680e-05 +# 13 3 0.0 0.084648 4.514174e-01 3.053876e-05 +# 14 4 0.0 1.305554 1.202619e-01 9.741702e-05 +# 15 0 0.0 -1.792391 7.264402e-02 5.034093e-04 +# 16 100 1.0 99.161634 1.000000e-08 4.031944e+03 <-- alert is on, predicted change point +# 17 200 0.0 185.229474 5.485437e-04 7.312609e+05 +# 18 300 0.0 270.403543 1.259683e-02 3.578470e+06 +# 19 400 0.0 357.113747 2.978766e-02 4.529837e+07 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/SsaSpikeDetector_df.py b/src/python/nimbusml/examples/examples_from_dataframe/SsaSpikeDetector_df.py new file mode 100644 index 00000000..d1297d09 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/SsaSpikeDetector_df.py @@ -0,0 +1,80 @@ +############################################################################### +# SsaSpikeDetector +import numpy as np +import pandas as pd +from nimbusml.timeseries import SsaSpikeDetector + +# This example creates a time series (list of data with the +# i-th element corresponding to the i-th time slot). +# The estimator is applied to identify spiking points in the series. +# This estimator can account for temporal seasonality in the data. 
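+# Note: the spike is placed right after three full seasons, so the first +# training_window_size points used to train the SSA model contain only the +# normal recurring pattern.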
+ +# Generate sample series data with a recurring +# pattern and a spike within the pattern +seasonality_size = 5 +seasonal_data = np.arange(seasonality_size) + +data = np.tile(seasonal_data, 3) +data = np.append(data, [100]) # add a spike +data = np.append(data, seasonal_data) + +X_train = pd.Series(data, name="ts") + +# X_train looks like this +# 0 0 +# 1 1 +# 2 2 +# 3 3 +# 4 4 +# 5 0 +# 6 1 +# 7 2 +# 8 3 +# 9 4 +# 10 0 +# 11 1 +# 12 2 +# 13 3 +# 14 4 +# 15 100 +# 16 0 +# 17 1 +# 18 2 +# 19 3 +# 20 4 + +training_seasons = 3 +training_size = seasonality_size * training_seasons + +ssd = SsaSpikeDetector(confidence=95, + pvalue_history_length=8, + training_window_size=training_size, + seasonal_window_size=seasonality_size + 1) << {'result': 'ts'} + +ssd.fit(X_train, verbose=1) +data = ssd.transform(X_train) + +print(data) + +# ts result.Alert result.Raw Score result.P-Value Score +# 0 0 0.0 -2.531824 5.000000e-01 +# 1 1 0.0 -0.008832 5.818072e-03 +# 2 2 0.0 0.763040 1.374071e-01 +# 3 3 0.0 0.693811 2.797713e-01 +# 4 4 0.0 1.442079 1.838294e-01 +# 5 0 0.0 -1.844414 1.707238e-01 +# 6 1 0.0 0.219578 4.364025e-01 +# 7 2 0.0 0.201708 4.505472e-01 +# 8 3 0.0 0.157089 4.684456e-01 +# 9 4 0.0 1.329494 1.773046e-01 +# 10 0 0.0 -1.792391 7.353794e-02 +# 11 1 0.0 0.161634 4.999295e-01 +# 12 2 0.0 0.092626 4.953789e-01 +# 13 3 0.0 0.084648 4.514174e-01 +# 14 4 0.0 1.305554 1.202619e-01 +# 15 100 1.0 98.207609 1.000000e-08 <-- alert is on, predicted spike +# 16 0 0.0 -13.831450 2.912225e-01 +# 17 1 0.0 -1.741884 4.379857e-01 +# 18 2 0.0 -0.465426 4.557261e-01 +# 19 3 0.0 -16.497133 2.926521e-01 +# 20 4 0.0 -29.817375 2.060473e-01 diff --git a/src/python/nimbusml/internal/core/timeseries/__init__.py b/src/python/nimbusml/internal/core/timeseries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/nimbusml/internal/core/timeseries/_iidchangepointdetector.py b/src/python/nimbusml/internal/core/timeseries/_iidchangepointdetector.py new file mode 100644 index 00000000..ae874a1c --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/_iidchangepointdetector.py @@ -0,0 +1,107 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +IidChangePointDetector +""" + +__all__ = ["IidChangePointDetector"] + + +from ...entrypoints.timeseriesprocessingentrypoints_iidchangepointdetector import \ + timeseriesprocessingentrypoints_iidchangepointdetector +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class IidChangePointDetector(BasePipelineItem, DefaultSignature): + """ + + This transform detects the change-points in an i.i.d. sequence using + adaptive kernel density estimation and martingales. + + .. remarks:: + ``IIDChangePointDetector`` assumes a sequence of data points that are + independently sampled from one + stationary distribution. `Adaptive kernel density estimation + `_ + is used to model the distribution. + + This transform detects + change points by calculating the martingale score for the sliding + window based on the estimated distribution. + The idea is based on the `Exchangeability + Martingales `_ that + detects a change of distribution over a stream of i.i.d. values. 
In + short, the value of the + martingale score starts increasing significantly when a sequence of + small p-values are detected in a row; this + indicates the change of the distribution of the underlying data + generation process. + + :param confidence: The confidence for change point detection in the range + [0, 100]. Used to set the threshold of the martingale score for + triggering alert. + + :param change_history_length: The length of the sliding window on p-value + for computing the martingale score. + + :param martingale: The type of martingale betting function used for + computing the martingale score. Available options are {``Power``, + ``Mixture``}. + + :param power_martingale_epsilon: The epsilon parameter for the Power + martingale if martingale is set to ``Power``. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: + /../nimbusml/examples/IidChangePointDetector.py + :language: python + """ + + @trace + def __init__( + self, + confidence=95.0, + change_history_length=20, + martingale='Power', + power_martingale_epsilon=0.1, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.confidence = confidence + self.change_history_length = change_history_length + self.martingale = martingale + self.power_martingale_epsilon = power_martingale_epsilon + + @property + def _entrypoint(self): + return timeseriesprocessingentrypoints_iidchangepointdetector + + @trace + def _get_node(self, **all_args): + algo_args = dict( + source=self.source, + name=self._name_or_source, + confidence=self.confidence, + change_history_length=self.change_history_length, + martingale=self.martingale, + power_martingale_epsilon=self.power_martingale_epsilon) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/_iidspikedetector.py b/src/python/nimbusml/internal/core/timeseries/_iidspikedetector.py new file mode 100644 index 00000000..00712d77 --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/_iidspikedetector.py @@ -0,0 +1,91 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +IidSpikeDetector +""" + +__all__ = ["IidSpikeDetector"] + + +from ...entrypoints.timeseriesprocessingentrypoints_iidspikedetector import \ + timeseriesprocessingentrypoints_iidspikedetector +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class IidSpikeDetector(BasePipelineItem, DefaultSignature): + """ + + This transform detects the spikes in an i.i.d. sequence using adaptive + kernel density estimation. + + .. remarks:: + ``IIDSpikeDetector`` assumes a sequence of data points that are + independently sampled from one stationary + distribution. `Adaptive kernel density estimation + `_ + is used to model the distribution. + The p-value score + indicates the likelihood of the current observation according to + the estimated distribution. The lower its value, the more likely the + current point is an outlier. 
+ + :param confidence: The confidence for spike detection in the range [0, + 100]. + + :param side: The argument that determines whether to detect positive or + negative anomalies, or both. Available options are {``Positive``, + ``Negative``, ``TwoSided``}. + + :param pvalue_history_length: The size of the sliding window for computing + the p-value. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`SsaSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: /../nimbusml/examples/IidSpikeDetector.py + :language: python + """ + + @trace + def __init__( + self, + confidence=99.0, + side='TwoSided', + pvalue_history_length=100, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.confidence = confidence + self.side = side + self.pvalue_history_length = pvalue_history_length + + @property + def _entrypoint(self): + return timeseriesprocessingentrypoints_iidspikedetector + + @trace + def _get_node(self, **all_args): + algo_args = dict( + source=self.source, + name=self._name_or_source, + confidence=self.confidence, + side=self.side, + pvalue_history_length=self.pvalue_history_length) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/_ssachangepointdetector.py b/src/python/nimbusml/internal/core/timeseries/_ssachangepointdetector.py new file mode 100644 index 00000000..297fae42 --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/_ssachangepointdetector.py @@ -0,0 +1,138 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +SsaChangePointDetector +""" + +__all__ = ["SsaChangePointDetector"] + + +from ...entrypoints.timeseriesprocessingentrypoints_ssachangepointdetector import \ + timeseriesprocessingentrypoints_ssachangepointdetector +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class SsaChangePointDetector(BasePipelineItem, DefaultSignature): + """ + + This transform detects the change-points in a seasonal time-series + using Singular Spectrum Analysis (SSA). + + .. remarks:: + `Singular Spectrum Analysis (SSA) + `_ is a + powerful framework for decomposing the time-series into trend, + seasonality and noise components as well as forecasting the future + values of the time-series. In order to remove the + effect of such components on anomaly detection, this transform adds + SSA as a time-series modeler component in the detection pipeline. + + The SSA component will be trained and it predicts the next expected + value on the time-series under normal condition; this expected value + is + further used to calculate the amount of deviation from the normal + behavior at that timestamp. + The distribution of this deviation is then modeled using `Adaptive + kernel density estimation + `_. + + This transform detects + change points by calculating the martingale score for the sliding + window based on the estimated distribution of deviations. 
+ The idea is based on the `Exchangeability + Martingales `_ that + detects a change of distribution over a stream of i.i.d. values. In + short, the value of the + martingale score starts increasing significantly when a sequence of + small p-values detected in a row; this + indicates the change of the distribution of the underlying data + generation process. + + :param training_window_size: The number of points, N, from the beginning + of the sequence used to train the SSA model. + + :param confidence: The confidence for change point detection in the range + [0, 100]. + + :param seasonal_window_size: An upper bound, L, on the largest relevant + seasonality in the input time-series, which also + determines the order of the autoregression of SSA. It must satisfy 2 + < L < N/2. + + :param change_history_length: The length of the sliding window on p-value + for computing the martingale score. + + :param error_function: The function used to compute the error between the + expected and the observed value. Possible values are: + {``SignedDifference``, ``AbsoluteDifference``, ``SignedProportion``, + ``AbsoluteProportion``, ``SquaredDifference``}. + + :param martingale: The type of martingale betting function used for + computing the martingale score. Available options are {``Power``, + ``Mixture``}. + + :param power_martingale_epsilon: The epsilon parameter for the Power + martingale if martingale is set to ``Power``. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaSpikeDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: /../nimbusml/examples/SsaChangePointDetector.py + :language: python + """ + + @trace + def __init__( + self, + training_window_size=100, + confidence=95.0, + seasonal_window_size=10, + change_history_length=20, + error_function='SignedDifference', + martingale='Power', + power_martingale_epsilon=0.1, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.training_window_size = training_window_size + self.confidence = confidence + self.seasonal_window_size = seasonal_window_size + self.change_history_length = change_history_length + self.error_function = error_function + self.martingale = martingale + self.power_martingale_epsilon = power_martingale_epsilon + + @property + def _entrypoint(self): + return timeseriesprocessingentrypoints_ssachangepointdetector + + @trace + def _get_node(self, **all_args): + algo_args = dict( + source=self.source, + name=self._name_or_source, + training_window_size=self.training_window_size, + confidence=self.confidence, + seasonal_window_size=self.seasonal_window_size, + change_history_length=self.change_history_length, + error_function=self.error_function, + martingale=self.martingale, + power_martingale_epsilon=self.power_martingale_epsilon) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/_ssaspikedetector.py b/src/python/nimbusml/internal/core/timeseries/_ssaspikedetector.py new file mode 100644 index 00000000..6a1097f8 --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/_ssaspikedetector.py @@ -0,0 +1,129 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +SsaSpikeDetector +""" + +__all__ = ["SsaSpikeDetector"] + + +from ...entrypoints.timeseriesprocessingentrypoints_ssaspikedetector import \ + timeseriesprocessingentrypoints_ssaspikedetector +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class SsaSpikeDetector(BasePipelineItem, DefaultSignature): + """ + + This transform detects the spikes in a seasonal time-series using + Singular Spectrum Analysis (SSA). + + .. remarks:: + `Singular Spectrum Analysis (SSA) + `_ is a + powerful + framework for decomposing the time-series into trend, seasonality and + noise components as well as forecasting + the future values of the time-series. In order to remove the effect + of such components on anomaly detection, + this transform adds SSA as a time-series modeler component in the + detection pipeline. + + The SSA component will be trained and it predicts the next expected + value on the time-series under normal condition; this expected value + is + further used to calculate the amount of deviation from the normal + (predicted) behavior at that timestamp. + The distribution of this deviation is then modeled using `Adaptive + kernel density estimation + `_. + + The `p-value score for the + current deviation is calculated based on the + estimated distribution. The lower its value, the more likely the + current point is an outlier. + + :param training_window_size: The number of points, N, from the beginning + of the sequence used to train the SSA + model. + + :param confidence: The confidence for spike detection in the range [0, + 100]. + + :param seasonal_window_size: An upper bound, L, on the largest relevant + seasonality in the input time-series, which + also determines the order of the autoregression of SSA. It must + satisfy 2 < L < N/2. + + :param side: The argument that determines whether to detect positive or + negative anomalies, or both. Available + options are {``Positive``, ``Negative``, ``TwoSided``}. + + :param pvalue_history_length: The size of the sliding window for computing + the p-value. + + :param error_function: The function used to compute the error between the + expected and the observed value. Possible + values are {``SignedDifference``, ``AbsoluteDifference``, + ``SignedProportion``, ``AbsoluteProportion``, + ``SquaredDifference``}. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. 
literalinclude:: /../nimbusml/examples/SsaSpikeDetector.py + :language: python + """ + + @trace + def __init__( + self, + training_window_size=100, + confidence=99.0, + seasonal_window_size=10, + side='TwoSided', + pvalue_history_length=100, + error_function='SignedDifference', + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.training_window_size = training_window_size + self.confidence = confidence + self.seasonal_window_size = seasonal_window_size + self.side = side + self.pvalue_history_length = pvalue_history_length + self.error_function = error_function + + @property + def _entrypoint(self): + return timeseriesprocessingentrypoints_ssaspikedetector + + @trace + def _get_node(self, **all_args): + algo_args = dict( + source=self.source, + name=self._name_or_source, + training_window_size=self.training_window_size, + confidence=self.confidence, + seasonal_window_size=self.seasonal_window_size, + side=self.side, + pvalue_history_length=self.pvalue_history_length, + error_function=self.error_function) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/tests/timeseries/__init__.py b/src/python/nimbusml/tests/timeseries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/nimbusml/tests/timeseries/test_iidchangepointdetector.py b/src/python/nimbusml/tests/timeseries/test_iidchangepointdetector.py new file mode 100644 index 00000000..e15863d1 --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_iidchangepointdetector.py @@ -0,0 +1,47 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import IidChangePointDetector + + +class TestIidChangePointDetector(unittest.TestCase): + + def test_correct_data_is_marked_as_change_point(self): + input_data = [5, 5, 5, 5, 5, 5, 5, 5] + input_data.extend([7, 7, 7, 7, 7, 7, 7, 7]) + X_train = pd.Series(input_data, name="ts") + + cpd = IidChangePointDetector(confidence=95, change_history_length=4) << {'result': 'ts'} + data = cpd.fit_transform(X_train) + + self.assertEqual(data.loc[8, 'result.Alert'], 1.0) + + data = data.loc[data['result.Alert'] == 1.0] + self.assertEqual(len(data), 1) + + def test_multiple_user_specified_columns_is_not_allowed(self): + path = get_dataset('timeseries').as_filepath() + data = FileDataStream.read_csv(path) + + try: + pipeline = Pipeline([ + IidChangePointDetector(columns=['t2', 't3'], change_history_length=5) + ]) + pipeline.fit_transform(data) + + except RuntimeError as e: + self.assertTrue('Only one column is allowed' in str(e)) + return + + self.fail() + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_iidspikedetector.py b/src/python/nimbusml/tests/timeseries/test_iidspikedetector.py new file mode 100644 index 00000000..61a105f8 --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_iidspikedetector.py @@ -0,0 +1,62 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import unittest + +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import IidSpikeDetector +from nimbusml.preprocessing.schema import TypeConverter + + +class TestIidSpikeDetector(unittest.TestCase): + + def test_correct_data_is_marked_as_anomaly(self): + X_train = pd.Series([5, 5, 5, 5, 5, 10, 5, 5, 5, 5, 5], name="ts") + isd = IidSpikeDetector(confidence=95, pvalue_history_length=3) << {'result': 'ts'} + data = isd.fit_transform(X_train) + + data = data.loc[data['result.Alert'] == 1.0] + self.assertEqual(len(data), 1) + self.assertEqual(data.iloc[0]['ts'], 10.0) + + def test_multiple_user_specified_columns_is_not_allowed(self): + path = get_dataset('timeseries').as_filepath() + data = FileDataStream.read_csv(path) + + try: + pipeline = Pipeline([ + IidSpikeDetector(columns=['t2', 't3'], pvalue_history_length=5) + ]) + pipeline.fit_transform(data) + + except RuntimeError as e: + self.assertTrue('Only one column is allowed' in str(e)) + return + + self.fail() + + def test_pre_transform_does_not_convert_non_time_series_columns(self): + X_train = pd.DataFrame({ + 'Date': ['2017-01', '2017-02', '2017-03'], + 'Values': [5.0, 5.0, 5.0]}) + + self.assertEqual(len(X_train.dtypes), 2) + self.assertEqual(str(X_train.dtypes[0]), 'object') + self.assertTrue(str(X_train.dtypes[1]).startswith('float')) + + isd = IidSpikeDetector(confidence=95, pvalue_history_length=3) << 'Values' + data = isd.fit_transform(X_train) + + self.assertEqual(len(data.dtypes), 4) + self.assertEqual(str(data.dtypes[0]), 'object') + self.assertTrue(str(data.dtypes[1]).startswith('float')) + self.assertTrue(str(data.dtypes[2]).startswith('float')) + self.assertTrue(str(data.dtypes[3]).startswith('float')) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_ssachangepointdetector.py b/src/python/nimbusml/tests/timeseries/test_ssachangepointdetector.py new file mode 100644 index 00000000..d3ad27ef --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_ssachangepointdetector.py @@ -0,0 +1,61 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import SsaChangePointDetector + + +class TestSsaChangePointDetector(unittest.TestCase): + + def test_correct_data_is_marked_as_change_point(self): + seasonality_size = 5 + seasonal_data = np.arange(seasonality_size) + + data = np.tile(seasonal_data, 3) + data = np.append(data, [0, 100, 200, 300, 400]) # change distribution + + X_train = pd.Series(data, name="ts") + + training_seasons = 3 + training_size = seasonality_size * training_seasons + + cpd = SsaChangePointDetector(confidence=95, + change_history_length=8, + training_window_size=training_size, + seasonal_window_size=seasonality_size + 1) << {'result': 'ts'} + + cpd.fit(X_train, verbose=1) + data = cpd.transform(X_train) + + + self.assertEqual(data.loc[16, 'result.Alert'], 1.0) + + data = data.loc[data['result.Alert'] == 1.0] + self.assertEqual(len(data), 1) + + def test_multiple_user_specified_columns_is_not_allowed(self): + path = get_dataset('timeseries').as_filepath() + data = FileDataStream.read_csv(path) + + try: + pipeline = Pipeline([ + SsaChangePointDetector(columns=['t2', 't3'], change_history_length=5) + ]) + pipeline.fit_transform(data) + + except RuntimeError as e: + self.assertTrue('Only one column is allowed' in str(e)) + return + + self.fail() + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_ssaspikedetector.py b/src/python/nimbusml/tests/timeseries/test_ssaspikedetector.py new file mode 100644 index 00000000..74610661 --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_ssaspikedetector.py @@ -0,0 +1,60 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.timeseries import SsaSpikeDetector + + +class TestSsaSpikeDetector(unittest.TestCase): + + def test_correct_data_is_marked_as_spike(self): + seasonality_size = 5 + seasonal_data = np.arange(seasonality_size) + + data = np.tile(seasonal_data, 3) + data = np.append(data, [100]) # add a spike + data = np.append(data, seasonal_data) + + X_train = pd.Series(data, name="ts") + training_seasons = 3 + training_size = seasonality_size * training_seasons + + ssd = SsaSpikeDetector(confidence=95, + pvalue_history_length=8, + training_window_size=training_size, + seasonal_window_size=seasonality_size + 1) << {'result': 'ts'} + + ssd.fit(X_train) + data = ssd.transform(X_train) + + self.assertEqual(data.loc[15, 'result.Alert'], 1.0) + + data = data.loc[data['result.Alert'] == 1.0] + self.assertEqual(len(data), 1) + + def test_multiple_user_specified_columns_is_not_allowed(self): + path = get_dataset('timeseries').as_filepath() + data = FileDataStream.read_csv(path) + + try: + pipeline = Pipeline([ + SsaSpikeDetector(columns=['t2', 't3'], pvalue_history_length=5) + ]) + pipeline.fit_transform(data) + + except RuntimeError as e: + self.assertTrue('Only one column is allowed' in str(e)) + return + + self.fail() + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py new file mode 100644 index 00000000..13db4520 --- /dev/null +++ b/src/python/nimbusml/timeseries/__init__.py @@ -0,0 +1,11 @@ +from ._iidspikedetector import IidSpikeDetector +from ._iidchangepointdetector import IidChangePointDetector +from ._ssaspikedetector import SsaSpikeDetector +from ._ssachangepointdetector import SsaChangePointDetector + +__all__ = [ + 'IidSpikeDetector', + 'IidChangePointDetector', + 'SsaSpikeDetector', + 'SsaChangePointDetector' +] diff --git a/src/python/nimbusml/timeseries/_iidchangepointdetector.py b/src/python/nimbusml/timeseries/_iidchangepointdetector.py new file mode 100644 index 00000000..0df53ba7 --- /dev/null +++ b/src/python/nimbusml/timeseries/_iidchangepointdetector.py @@ -0,0 +1,119 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +IidChangePointDetector +""" + +__all__ = ["IidChangePointDetector"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries._iidchangepointdetector import \ + IidChangePointDetector as core +from ..internal.utils.utils import trace + + +class IidChangePointDetector( + core, + BaseTransform, + TransformerMixin): + """ + + This transform detects the change-points in an i.i.d. sequence using + adaptive kernel density estimation and martingales. + + .. remarks:: + ``IIDChangePointDetector`` assumes a sequence of data points that are + independently sampled from one + stationary distribution. `Adaptive kernel density estimation + `_ + is used to model the distribution. 
+ + This transform detects + change points by calculating the martingale score for the sliding + window based on the estimated distribution. + The idea is based on the `Exchangeability + Martingales `_ that + detects a change of distribution over a stream of i.i.d. values. In + short, the value of the + martingale score starts increasing significantly when a sequence of + small p-values are detected in a row; this + indicates the change of the distribution of the underlying data + generation process. + + :param columns: see `Columns `_. + + :param confidence: The confidence for change point detection in the range + [0, 100]. Used to set the threshold of the martingale score for + triggering alert. + + :param change_history_length: The length of the sliding window on p-value + for computing the martingale score. + + :param martingale: The type of martingale betting function used for + computing the martingale score. Available options are {``Power``, + ``Mixture``}. + + :param power_martingale_epsilon: The epsilon parameter for the Power + martingale if martingale is set to ``Power``. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: + /../nimbusml/examples/IidSpikeChangePointDetector.py + :language: python + """ + + @trace + def __init__( + self, + confidence=95.0, + change_history_length=20, + martingale='Power', + power_martingale_epsilon=0.1, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + confidence=confidence, + change_history_length=change_history_length, + martingale=martingale, + power_martingale_epsilon=power_martingale_epsilon, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..preprocessing.schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/timeseries/_iidspikedetector.py b/src/python/nimbusml/timeseries/_iidspikedetector.py new file mode 100644 index 00000000..51582ae8 --- /dev/null +++ b/src/python/nimbusml/timeseries/_iidspikedetector.py @@ -0,0 +1,101 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +IidSpikeDetector +""" + +__all__ = ["IidSpikeDetector"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries._iidspikedetector import \ + IidSpikeDetector as core +from ..internal.utils.utils import trace + + +class IidSpikeDetector(core, BaseTransform, TransformerMixin): + """ + + This transform detects the spikes in a i.i.d. sequence using adaptive + kernel density estimation. + + .. remarks:: + ``IIDSpikeDetector`` assumes a sequence of data points that are + independently sampled from one stationary + distribution. `Adaptive kernel density estimation + `_ + is used to model the distribution. 
+ The `p-value score + indicates the likelihood of the current observation according to + the estimated distribution. The lower its value, the more likely the + current point is an outlier. + + :param columns: see `Columns `_. + + :param confidence: The confidence for spike detection in the range [0, + 100]. + + :param side: The argument that determines whether to detect positive or + negative anomalies, or both. Available options are {``Positive``, + ``Negative``, ``TwoSided``}. + + :param pvalue_history_length: The size of the sliding window for computing + the p-value. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`SsaSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: /../nimbusml/examples/IidSpikePointDetector.py + :language: python + """ + + @trace + def __init__( + self, + confidence=99.0, + side='TwoSided', + pvalue_history_length=100, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + confidence=confidence, + side=side, + pvalue_history_length=pvalue_history_length, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..preprocessing.schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/timeseries/_ssachangepointdetector.py b/src/python/nimbusml/timeseries/_ssachangepointdetector.py new file mode 100644 index 00000000..3b02d49e --- /dev/null +++ b/src/python/nimbusml/timeseries/_ssachangepointdetector.py @@ -0,0 +1,147 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +SsaChangePointDetector +""" + +__all__ = ["SsaChangePointDetector"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries._ssachangepointdetector import \ + SsaChangePointDetector as core +from ..internal.utils.utils import trace + + +class SsaChangePointDetector( + core, + BaseTransform, + TransformerMixin): + """ + + This transform detects the change-points in a seasonal time-series + using Singular Spectrum Analysis (SSA). + + .. remarks:: + `Singular Spectrum Analysis (SSA) + `_ is a + powerful framework for decomposing the time-series into trend, + seasonality and noise components as well as forecasting the future + values of the time-series. In order to remove the + effect of such components on anomaly detection, this transform add + SSA as a time-series modeler component in the detection pipeline. + + The SSA component will be trained and it predicts the next expected + value on the time-series under normal condition; this expected value + is + further used to calculate the amount of deviation from the normal + behavior at that timestamp. + The distribution of this deviation is then modeled using `Adaptive + kernel density estimation + `_. 
+ + This transform detects + change points by calculating the martingale score for the sliding + window based on the estimated distribution of deviations. + The idea is based on the `Exchangeability + Martingales `_ that + detects a change of distribution over a stream of i.i.d. values. In + short, the value of the + martingale score starts increasing significantly when a sequence of + small p-values detected in a row; this + indicates the change of the distribution of the underlying data + generation process. + + :param columns: see `Columns `_. + + :param training_window_size: The number of points, N, from the beginning + of the sequence used to train the SSA model. + + :param confidence: The confidence for change point detection in the range + [0, 100]. + + :param seasonal_window_size: An upper bound, L, on the largest relevant + seasonality in the input time-series, which also + determines the order of the autoregression of SSA. It must satisfy 2 + < L < N/2. + + :param change_history_length: The length of the sliding window on p-value + for computing the martingale score. + + :param error_function: The function used to compute the error between the + expected and the observed value. Possible values are: + {``SignedDifference``, ``AbsoluteDifference``, ``SignedProportion``, + ``AbsoluteProportion``, ``SquaredDifference``}. + + :param martingale: The type of martingale betting function used for + computing the martingale score. Available options are {``Power``, + ``Mixture``}. + + :param power_martingale_epsilon: The epsilon parameter for the Power + martingale if martingale is set to ``Power``. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaSpikeDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: /../nimbusml/examples/SsaChangePointDetector.py + :language: python + """ + + @trace + def __init__( + self, + training_window_size=100, + confidence=95.0, + seasonal_window_size=10, + change_history_length=20, + error_function='SignedDifference', + martingale='Power', + power_martingale_epsilon=0.1, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + training_window_size=training_window_size, + confidence=confidence, + seasonal_window_size=seasonal_window_size, + change_history_length=change_history_length, + error_function=error_function, + martingale=martingale, + power_martingale_epsilon=power_martingale_epsilon, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..preprocessing.schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/timeseries/_ssaspikedetector.py b/src/python/nimbusml/timeseries/_ssaspikedetector.py new file mode 100644 index 00000000..ad831a15 --- /dev/null +++ b/src/python/nimbusml/timeseries/_ssaspikedetector.py @@ -0,0 +1,136 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +SsaSpikeDetector +""" + +__all__ = ["SsaSpikeDetector"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries._ssaspikedetector import \ + SsaSpikeDetector as core +from ..internal.utils.utils import trace + + +class SsaSpikeDetector(core, BaseTransform, TransformerMixin): + """ + + This transform detects the spikes in a seasonal time-series using + Singular Spectrum Analysis (SSA). + + .. remarks:: + `Singular Spectrum Analysis (SSA) + `_ is a + powerful + framework for decomposing the time-series into trend, seasonality and + noise components as well as forecasting + the future values of the time-series. In order to remove the effect + of such components on anomaly detection, + this transform adds SSA as a time-series modeler component in the + detection pipeline. + + The SSA component will be trained and it predicts the next expected + value on the time-series under normal condition; this expected value + is + further used to calculate the amount of deviation from the normal + (predicted) behavior at that timestamp. + The distribution of this deviation is then modeled using `Adaptive + kernel density estimation + `_. + + The `p-value score for the + current deviation is calculated based on the + estimated distribution. The lower its value, the more likely the + current point is an outlier. + + :param columns: see `Columns `_. + + :param training_window_size: The number of points, N, from the beginning + of the sequence used to train the SSA + model. + + :param confidence: The confidence for spike detection in the range [0, + 100]. + + :param seasonal_window_size: An upper bound, L, on the largest relevant + seasonality in the input time-series, which + also determines the order of the autoregression of SSA. It must + satisfy 2 < L < N/2. + + :param side: The argument that determines whether to detect positive or + negative anomalies, or both. Available + options are {``Positive``, ``Negative``, ``TwoSided``}. + + :param pvalue_history_length: The size of the sliding window for computing + the p-value. + + :param error_function: The function used to compute the error between the + expected and the observed value. Possible + values are {``SignedDifference``, ``AbsoluteDifference``, + ``SignedProportion``, ``AbsoluteProportion``, + ``SquaredDifference``}. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:func:`IIDChangePointDetector + `, + :py:func:`IIDSpikeDetector + `, + :py:func:`SsaChangePointDetector + `. + + .. index:: models, timeseries, transform + + Example: + .. literalinclude:: /../nimbusml/examples/SsaSpikeDetector.py + :language: python + """ + + @trace + def __init__( + self, + training_window_size=100, + confidence=99.0, + seasonal_window_size=10, + side='TwoSided', + pvalue_history_length=100, + error_function='SignedDifference', + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + training_window_size=training_window_size, + confidence=confidence, + seasonal_window_size=seasonal_window_size, + side=side, + pvalue_history_length=pvalue_history_length, + error_function=error_function, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..preprocessing.schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index 07b1453c..5dac16f5 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -16,6 +16,8 @@ from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram from nimbusml.preprocessing import TensorFlowScorer from nimbusml.preprocessing.filter import SkipFilter, TakeFilter +from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, + SsaSpikeDetector, SsaChangePointDetector) from sklearn.utils.estimator_checks import _yield_all_checks, MULTI_OUTPUT this = os.path.abspath(os.path.dirname(__file__)) @@ -53,6 +55,13 @@ # fix pending in PR, bug cant handle csr matrix 'RangeFilter': 'check_estimators_dtypes, ' 'check_estimator_sparse_data', + # time series do not currently support sparse matrices + 'IidSpikeDetector': 'check_estimator_sparse_data', + 'IidChangePointDetector': 'check_estimator_sparse_data', + 'SsaSpikeDetector': 'check_estimator_sparse_data' + 'check_fit2d_1sample', # SSA requires more than one sample + 'SsaChangePointDetector': 'check_estimator_sparse_data' + 'check_fit2d_1sample', # SSA requires more than one sample # bug, low tolerance 'FastLinearRegressor': 'check_supervised_y_2d, ' 'check_regressor_data_not_an_array, ' @@ -180,6 +189,10 @@ 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), + 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), + 'IidChangePointDetector': IidChangePointDetector(columns=['F0']), + 'SsaSpikeDetector': SsaSpikeDetector(columns=['F0'], seasonal_window_size=2), + 'SsaChangePointDetector': SsaChangePointDetector(columns=['F0'], seasonal_window_size=2), 'TensorFlowScorer': TensorFlowScorer( model_location=os.path.join( this, diff --git a/src/python/tests_extended/test_docs_example.py b/src/python/tests_extended/test_docs_example.py index 50333cd9..23fb2f82 100644 --- a/src/python/tests_extended/test_docs_example.py +++ b/src/python/tests_extended/test_docs_example.py @@ -70,6 +70,14 @@ def test_examples(self): 'NaiveBayesClassifier_df.py' ]: continue + # skip for ubuntu 14 tests + if platform.linux_distribution()[0] == 'Ubuntu' and platform.linux_distribution()[1][:2] == '14': + if name in [ + # libdl needs to be setup + 'Image.py', + 'Image_df.py' + ]: + continue # skip for centos7 tests if platform.linux_distribution()[0] == 'CentOS Linux': if name in [ diff --git a/src/python/tools/compiler_utils.py b/src/python/tools/compiler_utils.py index d7462c78..9a5e1e07 100644 --- a/src/python/tools/compiler_utils.py +++ b/src/python/tools/compiler_utils.py @@ -120,6 +120,10 @@ def _nodes_with_presteps(self): '''from ..schema import TypeConverter return [TypeConverter(result_type='R4')._steal_io(self), self]''' +timeseries_to_r4_converter = \ + '''from ..preprocessing.schema import TypeConverter +return [TypeConverter(result_type='R4')._steal_io(self), self]''' + _presteps = { 'MinMaxScaler': int_to_r4_converter, 'MeanVarianceScaler': int_to_r4_converter, @@ -127,6 +131,11 @@ def _nodes_with_presteps(self): 'Binner': int_to_r4_converter, # 'SupervisedBinner': int_to_r4_converter, # not exist in nimbusml + 'IidSpikeDetector': 
timeseries_to_r4_converter, + 'IidChangePointDetector': timeseries_to_r4_converter, + 'SsaSpikeDetector': timeseries_to_r4_converter, + 'SsaChangePointDetector': timeseries_to_r4_converter, + 'PcaTransformer': '''from ..preprocessing.schema import TypeConverter if type(self._columns) == dict: diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index c19aad98..25708e21 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -539,6 +539,30 @@ "Module": "decomposition", "Type": "Anomaly" }, + { + "Name": "TimeSeriesProcessingEntryPoints.IidSpikeDetector", + "NewName": "IidSpikeDetector", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "TimeSeriesProcessingEntryPoints.IidChangePointDetector", + "NewName": "IidChangePointDetector", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "TimeSeriesProcessingEntryPoints.SsaSpikeDetector", + "NewName": "SsaSpikeDetector", + "Module": "timeseries", + "Type": "Transform" + }, + { + "Name": "TimeSeriesProcessingEntryPoints.SsaChangePointDetector", + "NewName": "SsaChangePointDetector", + "Module": "timeseries", + "Type": "Transform" + }, { "Name": "Trainers.PoissonRegressor", "NewName": "PoissonRegressionRegressor",