Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions src/python/docs/docstrings/LinearSvmBinaryClassifier.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""

Linear Support Vector Machine (SVM) Binary Classifier

.. remarks::
Linear SVM implements an algorithm that finds a hyperplane in the
feature space for binary classification, by solving an SVM problem.
For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
prediction is given by determining what side of the hyperplane the
point falls into. That is the same as the sign of the features'
weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*,
where *w_0, w_1,..., w_{D-1}* are the weights computed by the
algorithm, and *b* is the bias computed by the algorithm.

The algorithm implemented is the PEGASOS method, which alternates
between stochastic gradient descent steps and projection steps,
introduced by Shalev-Shwartz, Singer and Srebro.


**Reference**

`Wikipedia entry for Support Vector Machine
<https://en.wikipedia.org/wiki/Support-vector_machine>`_

`Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
<https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_


:param normalize: Specifies the type of automatic normalization used:

* ``"Auto"``: if normalization is needed, it is performed
automatically. This is the default choice.
* ``"No"``: no normalization is performed.
* ``"Yes"``: normalization is performed.
* ``"Warn"``: if normalization is needed, a warning
message is displayed, but normalization is not performed.

Normalization rescales disparate data ranges to a standard scale.
Feature
scaling ensures the distances between data points are proportional
and
enables various optimization methods such as gradient descent to
converge
much faster. If normalization is performed, a ``MinMax`` normalizer
is
used. It normalizes values in an interval [a, b] where ``-1 <= a <=
0``
and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves
sparsity by mapping zero to zero.


.. index:: models, classification, svm

Example:
.. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
:language: python
"""
6 changes: 6 additions & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
<Compile Include="nimbusml\examples\examples_from_dataframe\GlobalContrastRowScaler_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\Handler_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\IidChangePointDetector_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\LinearSvmBinaryClassifier_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaForecaster_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaChangePointDetector_df.py" />
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaSpikeDetector_df.py" />
Expand Down Expand Up @@ -140,6 +141,7 @@
<Compile Include="nimbusml\examples\Handler.py" />
<Compile Include="nimbusml\examples\Hinge.py" />
<Compile Include="nimbusml\examples\IidChangePointDetector.py" />
<Compile Include="nimbusml\examples\LinearSvmBinaryClassifier.py" />
<Compile Include="nimbusml\examples\SsaForecaster.py" />
<Compile Include="nimbusml\examples\SsaChangePointDetector.py" />
<Compile Include="nimbusml\examples\SsaSpikeDetector.py" />
Expand Down Expand Up @@ -489,6 +491,7 @@
<Compile Include="nimbusml\internal\core\linear_model\averagedperceptronbinaryclassifier.py" />
<Compile Include="nimbusml\internal\core\linear_model\fastlinearbinaryclassifier.py" />
<Compile Include="nimbusml\internal\core\linear_model\fastlinearregressor.py" />
<Compile Include="nimbusml\internal\core\linear_model\linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\internal\core\linear_model\logisticregressionbinaryclassifier.py" />
<Compile Include="nimbusml\internal\core\linear_model\logisticregressionclassifier.py" />
<Compile Include="nimbusml\internal\core\linear_model\sgdbinaryclassifier.py" />
Expand Down Expand Up @@ -531,6 +534,7 @@
<Compile Include="nimbusml\linear_model\averagedperceptronbinaryclassifier.py" />
<Compile Include="nimbusml\linear_model\fastlinearbinaryclassifier.py" />
<Compile Include="nimbusml\linear_model\fastlinearregressor.py" />
<Compile Include="nimbusml\linear_model\linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\linear_model\logisticregressionbinaryclassifier.py" />
<Compile Include="nimbusml\linear_model\logisticregressionclassifier.py" />
<Compile Include="nimbusml\linear_model\sgdbinaryclassifier.py" />
Expand Down Expand Up @@ -587,6 +591,7 @@
<Compile Include="nimbusml\tests\ensemble\__init__.py" />
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
<Compile Include="nimbusml\tests\idv\__init__.py" />
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
<Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
Expand Down Expand Up @@ -860,6 +865,7 @@
<Content Include="docs\docstrings\LightGbmRegressor.txt" />
<Content Include="docs\docstrings\LightLda.txt" />
<Content Include="docs\docstrings\LinearKernel.txt" />
<Content Include="docs\docstrings\LinearSvmBinaryClassifier.txt" />
<Content Include="docs\docstrings\Loader.txt" />
<Content Include="docs\docstrings\LocalDeepSvmBinaryClassifier.txt" />
<Content Include="docs\docstrings\LogisticRegressionBinaryClassifier.txt" />
Expand Down
37 changes: 37 additions & 0 deletions src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
###############################################################################
# LinearSvmBinaryClassifier
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LinearSvmBinaryClassifier

# Load the built-in 'infert' dataset from disk as a FileDataStream.
infert_path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(infert_path)

# Preview the first rows of the input.
print(data.head())
# age case education induced parity ... row_num spontaneous ...
# 0 26 1 0-5yrs 1 6 ... 1 2 ...
# 1 42 1 0-5yrs 1 1 ... 2 0 ...
# 2 39 1 0-5yrs 2 6 ... 3 0 ...
# 3 34 1 0-5yrs 2 4 ... 4 0 ...
# 4 35 1 6-11yrs 1 3 ... 5 1 ...

# Build a one-step training pipeline: a linear SVM that predicts 'case'
# from three numeric columns.
svm_learner = LinearSvmBinaryClassifier(
    feature=['age', 'parity', 'spontaneous'],
    label='case')
pipeline = Pipeline([svm_learner])

# Fit on the data, then score and evaluate on the same data.
# TODO: Replace with CV
fitted_pipeline = pipeline.fit(data)
metrics, predictions = fitted_pipeline.test(data, output_scores=True)

# Show the first few predictions.
print(predictions.head())
# PredictedLabel Score Probability
# 0 1 0.688481 0.607060
# 1 0 -2.514992 0.203312
# 2 0 -3.479344 0.129230
# 3 0 -3.016621 0.161422
# 4 0 -0.825512 0.397461

# Show the evaluation metrics.
print(metrics)
# AUC Accuracy Positive precision Positive recall ...
# 0 0.705476 0.71371 0.666667 0.289157 ...
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
###############################################################################
# LinearSvmBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LinearSvmBinaryClassifier
from sklearn.model_selection import train_test_split

# Build train and test sets from the built-in 'infert' data set.
# Unnamed: 0 education age parity induced case spontaneous stratum \
# 0 1 0.0 26.0 6.0 1.0 1.0 2.0 1.0
# 1 2 0.0 42.0 1.0 1.0 1.0 0.0 2.0
# pooled.stratum education_str
# 0 3.0 0-5yrs
# 1 1.0 0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# Strip the ': ' sequence from column names, then one-hot encode the
# categorical 'education_str' column.
df.columns = [name.replace(': ', '') for name in df.columns]
education_encoder = OneHotVectorizer() << 'education_str'
df = education_encoder.fit_transform(df)

# Every column except 'case' is a feature; 'case' is the label.
feature_frame = df.loc[:, df.columns != 'case']
label_series = df['case']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, label_series)

# Train the linear SVM and predict on the held-out rows.
svm = LinearSvmBinaryClassifier().fit(X_train, y_train)
scores = svm.predict(X_test)

# Evaluate the model. The predictions are turned into a plain list before
# being compared element by element with the test labels.
print('Accuracy:', np.mean(y_test == list(scores)))
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
"""
LinearSvmBinaryClassifier
"""

__all__ = ["LinearSvmBinaryClassifier"]


from ...entrypoints.trainers_linearsvmbinaryclassifier import \
trainers_linearsvmbinaryclassifier
from ...utils.utils import trace
from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles


class LinearSvmBinaryClassifier(
        BasePipelineItem,
        DefaultSignatureWithRoles):
    r"""

    Linear Support Vector Machine (SVM) Binary Classifier

    .. remarks::
        Linear SVM implements an algorithm that finds a hyperplane in the
        feature space for binary classification, by solving an SVM problem.
        For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
        prediction is given by determining what side of the hyperplane the
        point falls into. That is the same as the sign of the features'
        weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*,
        where *w_0, w_1,..., w_{D-1}* are the weights computed by the
        algorithm, and *b* is the bias computed by the algorithm.

        The algorithm implemented is the PEGASOS method, which alternates
        between stochastic gradient descent steps and projection steps,
        introduced by Shalev-Shwartz, Singer and Srebro.


        **Reference**

            `Wikipedia entry for Support Vector Machine
            <https://en.wikipedia.org/wiki/Support-vector_machine>`_

            `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
            <https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_


    :param normalize: Specifies the type of automatic normalization used:

        * ``"Auto"``: if normalization is needed, it is performed
          automatically. This is the default choice.
        * ``"No"``: no normalization is performed.
        * ``"Yes"``: normalization is performed.
        * ``"Warn"``: if normalization is needed, a warning
          message is displayed, but normalization is not performed.

        Normalization rescales disparate data ranges to a standard scale.
        Feature scaling ensures the distances between data points are
        proportional and enables various optimization methods such as
        gradient descent to converge much faster. If normalization is
        performed, a ``MinMax`` normalizer is used. It normalizes values in
        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
        zero.

    :param caching: Whether trainer should cache input training data.

    :param lambda_: Regularizer constant.

    :param perform_projection: Perform projection to unit-ball? Typically used
        with batch size > 1.

    :param number_of_iterations: Number of iterations.

    :param initial_weights_diameter: Sets the initial weights diameter that
        specifies the range from which values are drawn for the initial
        weights. These weights are initialized randomly from within this range.
        For example, if the diameter is specified to be ``d``, then the weights
        are uniformly distributed between ``-d/2`` and ``d/2``. The default
        value is ``0``, which specifies that all the weights are set to zero.

    :param no_bias: No bias.

    :param initial_weights: Initial Weights and bias, comma-separated.

    :param shuffle: Whether to shuffle for each training iteration.

    :param batch_size: Batch size.

    :param params: Additional arguments sent to compute engine.

    .. index:: models, classification, svm

    Example:
       .. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
              :language: python
    """

    # NOTE: the class docstring is a raw string on purpose. It embeds LaTeX
    # commands, and in a regular string literal the ``\r`` of ``\right``
    # would be read as a carriage return while ``\s`` and ``\l`` are
    # invalid escape sequences.
    # NOTE(review): this module is marked as generated by
    # tools/entrypoint_compiler.py -- the same change probably has to be
    # made in the generator's docstring template; confirm before merging.

    @trace
    def __init__(
            self,
            normalize='Auto',
            caching='Auto',
            lambda_=0.001,
            perform_projection=False,
            number_of_iterations=1,
            initial_weights_diameter=0.0,
            no_bias=False,
            initial_weights=None,
            shuffle=True,
            batch_size=1,
            **params):
        """
        Record the trainer settings on the instance.

        The base pipeline item is initialised as a ``'classifier'`` and
        receives any extra keyword arguments in ``params``; every named
        argument is stored unchanged as an attribute of the same name.
        """
        BasePipelineItem.__init__(
            self, type='classifier', **params)

        self.normalize = normalize
        self.caching = caching
        self.lambda_ = lambda_
        self.perform_projection = perform_projection
        self.number_of_iterations = number_of_iterations
        self.initial_weights_diameter = initial_weights_diameter
        self.no_bias = no_bias
        self.initial_weights = initial_weights
        self.shuffle = shuffle
        self.batch_size = batch_size

    @property
    def _entrypoint(self):
        """Return the entrypoint callable used to build this trainer's
        node."""
        return trainers_linearsvmbinaryclassifier

    @trace
    def _get_node(self, **all_args):
        """
        Build the entrypoint node for this trainer.

        The column-role arguments are looked up through
        ``_getattr_role`` and combined with the settings stored by
        ``__init__``. These values are written into ``all_args``
        (overriding entries with the same key) and the result is passed to
        the entrypoint.

        :param all_args: keyword arguments forwarded to the entrypoint.
        :return: whatever the entrypoint callable returns.
        """
        algo_args = dict(
            feature_column_name=self._getattr_role(
                'feature_column_name',
                all_args),
            label_column_name=self._getattr_role(
                'label_column_name',
                all_args),
            example_weight_column_name=self._getattr_role(
                'example_weight_column_name',
                all_args),
            # The public ``normalize`` setting is passed to the entrypoint
            # under the name ``normalize_features``.
            normalize_features=self.normalize,
            caching=self.caching,
            lambda_=self.lambda_,
            perform_projection=self.perform_projection,
            number_of_iterations=self.number_of_iterations,
            initial_weights_diameter=self.initial_weights_diameter,
            no_bias=self.no_bias,
            initial_weights=self.initial_weights,
            shuffle=self.shuffle,
            batch_size=self.batch_size)

        all_args.update(algo_args)
        return self._entrypoint(**all_args)
2 changes: 2 additions & 0 deletions src/python/nimbusml/linear_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .fastlinearbinaryclassifier import FastLinearBinaryClassifier
from .fastlinearclassifier import FastLinearClassifier
from .fastlinearregressor import FastLinearRegressor
from .linearsvmbinaryclassifier import LinearSvmBinaryClassifier
from .logisticregressionbinaryclassifier import \
LogisticRegressionBinaryClassifier
from .logisticregressionclassifier import LogisticRegressionClassifier
Expand All @@ -17,6 +18,7 @@
'FastLinearBinaryClassifier',
'FastLinearClassifier',
'FastLinearRegressor',
'LinearSvmBinaryClassifier',
'LogisticRegressionBinaryClassifier',
'LogisticRegressionClassifier',
'OnlineGradientDescentRegressor',
Expand Down
Loading