Enable LinearSvmBinaryClassifier (#180)

najeeb-kazmi · web-flow · commit 08d8abf004e4 · 2019-07-11T13:04:09.000-07:00
* Enable LinearSvmBinaryClassifier, add examples, add test, and update docs

* Add test for predict_proba() and decision_function()
diff --git a/src/python/docs/docstrings/LinearSvmBinaryClassifier.txt b/src/python/docs/docstrings/LinearSvmBinaryClassifier.txt
@@ -0,0 +1,57 @@
+    """
+
+    Linear Support Vector Machine (SVM) Binary Classifier
+
+    .. remarks::
+        Linear SVM implements an algorithm that finds a hyperplane in the
+        feature space for binary classification, by solving an SVM problem.
+        For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
+        prediction is given by determining which side of the hyperplane the
+        point falls on. That is the same as the sign of the features'
+        weighted sum, i.e. *\\sum_{i = 0}^{D-1} \\left(w_i * f_i \\right) + b*,
+        where *w_0, w_1,..., w_{D-1}* are the weights computed by the
+        algorithm, and *b* is the bias computed by the algorithm.
+
+        The algorithm implemented is the PEGASOS method, which alternates
+        between stochastic gradient descent steps and projection steps,
+        introduced by Shalev-Shwartz, Singer and Srebro.
+
+
+        **Reference**
+    
+            `Wikipedia entry for Support Vector Machine
+            <https://en.wikipedia.org/wiki/Support-vector_machine>`_
+    
+            `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
+            <https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_
+    
+		
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature
+        scaling ensures the distances between data points are proportional
+        and
+        enables various optimization methods such as gradient descent to
+        converge
+        much faster. If normalization is performed, a ``MinMax`` normalizer
+        is
+        used. It normalizes values in an interval [a, b] where ``-1 <= a <=
+        0``
+        and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves
+        sparsity by mapping zero to zero.
+
+
+    .. index:: models, classification, svm
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
+               :language: python
+    """
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
@@ -89,6 +89,7 @@
     <Compile Include="nimbusml\examples\examples_from_dataframe\GlobalContrastRowScaler_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\Handler_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\IidChangePointDetector_df.py" />
+    <Compile Include="nimbusml\examples\examples_from_dataframe\LinearSvmBinaryClassifier_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\SsaForecaster_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\SsaChangePointDetector_df.py" />
     <Compile Include="nimbusml\examples\examples_from_dataframe\SsaSpikeDetector_df.py" />
@@ -140,6 +141,7 @@
     <Compile Include="nimbusml\examples\Handler.py" />
     <Compile Include="nimbusml\examples\Hinge.py" />
     <Compile Include="nimbusml\examples\IidChangePointDetector.py" />
+    <Compile Include="nimbusml\examples\LinearSvmBinaryClassifier.py" />
     <Compile Include="nimbusml\examples\SsaForecaster.py" />
     <Compile Include="nimbusml\examples\SsaChangePointDetector.py" />
     <Compile Include="nimbusml\examples\SsaSpikeDetector.py" />
@@ -489,6 +491,7 @@
     <Compile Include="nimbusml\internal\core\linear_model\averagedperceptronbinaryclassifier.py" />
     <Compile Include="nimbusml\internal\core\linear_model\fastlinearbinaryclassifier.py" />
     <Compile Include="nimbusml\internal\core\linear_model\fastlinearregressor.py" />
+    <Compile Include="nimbusml\internal\core\linear_model\linearsvmbinaryclassifier.py" />
     <Compile Include="nimbusml\internal\core\linear_model\logisticregressionbinaryclassifier.py" />
     <Compile Include="nimbusml\internal\core\linear_model\logisticregressionclassifier.py" />
     <Compile Include="nimbusml\internal\core\linear_model\sgdbinaryclassifier.py" />
@@ -531,6 +534,7 @@
     <Compile Include="nimbusml\linear_model\averagedperceptronbinaryclassifier.py" />
     <Compile Include="nimbusml\linear_model\fastlinearbinaryclassifier.py" />
     <Compile Include="nimbusml\linear_model\fastlinearregressor.py" />
+    <Compile Include="nimbusml\linear_model\linearsvmbinaryclassifier.py" />
     <Compile Include="nimbusml\linear_model\logisticregressionbinaryclassifier.py" />
     <Compile Include="nimbusml\linear_model\logisticregressionclassifier.py" />
     <Compile Include="nimbusml\linear_model\sgdbinaryclassifier.py" />
@@ -587,6 +591,7 @@
     <Compile Include="nimbusml\tests\ensemble\__init__.py" />
     <Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
     <Compile Include="nimbusml\tests\idv\__init__.py" />
+    <Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
     <Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
     <Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
     <Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
@@ -860,6 +865,7 @@
     <Content Include="docs\docstrings\LightGbmRegressor.txt" />
     <Content Include="docs\docstrings\LightLda.txt" />
     <Content Include="docs\docstrings\LinearKernel.txt" />
+    <Content Include="docs\docstrings\LinearSvmBinaryClassifier.txt" />
     <Content Include="docs\docstrings\Loader.txt" />
     <Content Include="docs\docstrings\LocalDeepSvmBinaryClassifier.txt" />
     <Content Include="docs\docstrings\LogisticRegressionBinaryClassifier.txt" />
diff --git a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
@@ -0,0 +1,37 @@
+###############################################################################
+# LinearSvmBinaryClassifier
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.linear_model import LinearSvmBinaryClassifier
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path)
+print(data.head())
+#   age  case education  induced  parity   ... row_num  spontaneous  ...
+# 0   26     1    0-5yrs        1       6  ...       1            2  ...
+# 1   42     1    0-5yrs        1       1  ...       2            0  ...
+# 2   39     1    0-5yrs        2       6  ...       3            0  ...
+# 3   34     1    0-5yrs        2       4  ...       4            0  ...
+# 4   35     1   6-11yrs        1       3  ...       5            1  ...
+# define the training pipeline
+pipeline = Pipeline([LinearSvmBinaryClassifier(
+    feature=['age', 'parity', 'spontaneous'], label='case')])
+
+# train, predict, and evaluate
+# TODO: Replace with CV
+metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#    PredictedLabel     Score  Probability
+# 0               1  0.688481     0.607060
+# 1               0 -2.514992     0.203312
+# 2               0 -3.479344     0.129230
+# 3               0 -3.016621     0.161422
+# 4               0 -0.825512     0.397461
+# print evaluation metrics
+print(metrics)
+#         AUC  Accuracy  Positive precision  Positive recall  ...
+# 0  0.705476   0.71371            0.666667         0.289157  ...
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py
@@ -0,0 +1,31 @@
+###############################################################################
+# LinearSvmBinaryClassifier
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LinearSvmBinaryClassifier
+from sklearn.model_selection import train_test_split
+
+# use the built-in data set 'infert' to create test and train data
+#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
+# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
+# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
+#   pooled.stratum education_str
+# 0             3.0        0-5yrs
+# 1             1.0        0-5yrs
+np.random.seed(0)
+
+df = get_dataset("infert").as_df()
+
+# remove ': ' from column names, and encode the categorical column
+df.columns = [i.replace(': ', '') for i in df.columns]
+df = (OneHotVectorizer() << 'education_str').fit_transform(df)
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(df.loc[:, df.columns != 'case'], df['case'])
+
+lr = LinearSvmBinaryClassifier().fit(X_train, y_train)
+scores = lr.predict(X_test)
+
+# Evaluate the model
+print('Accuracy:', np.mean(y_test == [i for i in scores]))
diff --git a/src/python/nimbusml/internal/core/linear_model/linearsvmbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/linearsvmbinaryclassifier.py
@@ -0,0 +1,161 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+LinearSvmBinaryClassifier
+"""
+
+__all__ = ["LinearSvmBinaryClassifier"]
+
+
+from ...entrypoints.trainers_linearsvmbinaryclassifier import \
+    trainers_linearsvmbinaryclassifier
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles
+
+
+class LinearSvmBinaryClassifier(
+        BasePipelineItem,
+        DefaultSignatureWithRoles):
+    """
+
+    Linear Support Vector Machine (SVM) Binary Classifier
+
+    .. remarks::
+        Linear SVM implements an algorithm that finds a hyperplane in the
+        feature space for binary classification, by solving an SVM problem.
+        For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
+        prediction is given by determining which side of the hyperplane the
+        point falls on. That is the same as the sign of the features'
+        weighted sum, i.e. *\\sum_{i = 0}^{D-1} \\left(w_i * f_i \\right) + b*,
+        where *w_0, w_1,..., w_{D-1}* are the weights computed by the
+        algorithm, and *b* is the bias computed by the algorithm.
+
+        The algorithm implemented is the PEGASOS method, which alternates
+        between stochastic gradient descent steps and projection steps,
+        introduced by Shalev-Shwartz, Singer and Srebro.
+
+
+        **Reference**
+
+            `Wikipedia entry for Support Vector Machine
+            <https://en.wikipedia.org/wiki/Support-vector_machine>`_
+
+            `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
+            <https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_
+
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature
+        scaling ensures the distances between data points are proportional
+        and
+        enables various optimization methods such as gradient descent to
+        converge
+        much faster. If normalization is performed, a ``MinMax`` normalizer
+        is
+        used. It normalizes values in an interval [a, b] where ``-1 <= a <=
+        0``
+        and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves
+        sparsity by mapping zero to zero.
+
+    :param caching: Whether trainer should cache input training data.
+
+    :param lambda_: Regularizer constant.
+
+    :param perform_projection: Perform projection to unit-ball? Typically used
+        with batch size > 1.
+
+    :param number_of_iterations: Number of iterations.
+
+    :param initial_weights_diameter: Sets the initial weights diameter that
+        specifies the range from which values are drawn for the initial
+        weights. These weights are initialized randomly from within this range.
+        For example, if the diameter is specified to be ``d``, then the weights
+        are uniformly distributed between ``-d/2`` and ``d/2``. The default
+        value is ``0``, which specifies that all the weights are set to zero.
+
+    :param no_bias: No bias.
+
+    :param initial_weights: Initial Weights and bias, comma-separated.
+
+    :param shuffle: Whether to shuffle for each training iteration.
+
+    :param batch_size: Batch size.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. index:: models, classification, svm
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
+               :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            normalize='Auto',
+            caching='Auto',
+            lambda_=0.001,
+            perform_projection=False,
+            number_of_iterations=1,
+            initial_weights_diameter=0.0,
+            no_bias=False,
+            initial_weights=None,
+            shuffle=True,
+            batch_size=1,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='classifier', **params)
+
+        self.normalize = normalize
+        self.caching = caching
+        self.lambda_ = lambda_
+        self.perform_projection = perform_projection
+        self.number_of_iterations = number_of_iterations
+        self.initial_weights_diameter = initial_weights_diameter
+        self.no_bias = no_bias
+        self.initial_weights = initial_weights
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+
+    @property
+    def _entrypoint(self):
+        return trainers_linearsvmbinaryclassifier
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            feature_column_name=self._getattr_role(
+                'feature_column_name',
+                all_args),
+            label_column_name=self._getattr_role(
+                'label_column_name',
+                all_args),
+            example_weight_column_name=self._getattr_role(
+                'example_weight_column_name',
+                all_args),
+            normalize_features=self.normalize,
+            caching=self.caching,
+            lambda_=self.lambda_,
+            perform_projection=self.perform_projection,
+            number_of_iterations=self.number_of_iterations,
+            initial_weights_diameter=self.initial_weights_diameter,
+            no_bias=self.no_bias,
+            initial_weights=self.initial_weights,
+            shuffle=self.shuffle,
+            batch_size=self.batch_size)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/linear_model/__init__.py b/src/python/nimbusml/linear_model/__init__.py
@@ -3,6 +3,7 @@
 from .fastlinearbinaryclassifier import FastLinearBinaryClassifier
 from .fastlinearclassifier import FastLinearClassifier
 from .fastlinearregressor import FastLinearRegressor
+from .linearsvmbinaryclassifier import LinearSvmBinaryClassifier
 from .logisticregressionbinaryclassifier import \
     LogisticRegressionBinaryClassifier
 from .logisticregressionclassifier import LogisticRegressionClassifier
@@ -17,6 +18,7 @@
     'FastLinearBinaryClassifier',
     'FastLinearClassifier',
     'FastLinearRegressor',
+    'LinearSvmBinaryClassifier',
     'LogisticRegressionBinaryClassifier',
     'LogisticRegressionClassifier',
     'OnlineGradientDescentRegressor',
diff --git a/src/python/nimbusml/linear_model/linearsvmbinaryclassifier.py b/src/python/nimbusml/linear_model/linearsvmbinaryclassifier.py
diff --git a/src/python/nimbusml/tests/linear_model/test_linearsvmbinaryclassifier.py b/src/python/nimbusml/tests/linear_model/test_linearsvmbinaryclassifier.py
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json