Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Commit 08d8abf

Browse files
authored
Enable LinearSvmBinaryClassifier (#180)
* Enable LinearSvmBinaryClassifier, add examples, add test, and update docs * Add test for predict_proba() and decision_function()
1 parent 8da13e7 commit 08d8abf

File tree

9 files changed

+549
-0
lines changed

9 files changed

+549
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""
2+
3+
Linear Support Vector Machine (SVM) Binary Classifier
4+
5+
.. remarks::
6+
Linear SVM implements an algorithm that finds a hyperplane in the
7+
feature space for binary classification, by solving an SVM problem.
8+
For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
9+
prediction is given by determining what side of the hyperplane the
10+
point falls into. That is the same as the sign of the features'
11+
weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*,
12+
where *w_0, w_1,..., w_{D-1}* are the weights computed by the
13+
algorithm, and *b* is the bias computed by the algorithm.
14+
15+
The algorithm implemented is the PEGASOS method, which alternates
16+
between stochastic gradient descent steps and projection steps,
17+
introduced by Shalev-Shwartz, Singer and Srebro.
18+
19+
20+
**Reference**
21+
22+
`Wikipedia entry for Support Vector Machine
23+
<https://en.wikipedia.org/wiki/Support-vector_machine>`_
24+
25+
`Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
26+
<https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_
27+
28+
29+
:param normalize: Specifies the type of automatic normalization used:
30+
31+
* ``"Auto"``: if normalization is needed, it is performed
32+
automatically. This is the default choice.
33+
* ``"No"``: no normalization is performed.
34+
* ``"Yes"``: normalization is performed.
35+
* ``"Warn"``: if normalization is needed, a warning
36+
message is displayed, but normalization is not performed.
37+
38+
Normalization rescales disparate data ranges to a standard scale.
39+
Feature
40+
scaling ensures the distances between data points are proportional
41+
and
42+
enables various optimization methods such as gradient descent to
43+
converge
44+
much faster. If normalization is performed, a ``MinMax`` normalizer
45+
is
46+
used. It normalizes values in an interval [a, b] where ``-1 <= a <=
47+
0``
48+
and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves
49+
sparsity by mapping zero to zero.
50+
51+
52+
.. index:: models, classification, svm
53+
54+
Example:
55+
.. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
56+
:language: python
57+
"""

src/python/nimbusml.pyproj

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
<Compile Include="nimbusml\examples\examples_from_dataframe\GlobalContrastRowScaler_df.py" />
9090
<Compile Include="nimbusml\examples\examples_from_dataframe\Handler_df.py" />
9191
<Compile Include="nimbusml\examples\examples_from_dataframe\IidChangePointDetector_df.py" />
92+
<Compile Include="nimbusml\examples\examples_from_dataframe\LinearSvmBinaryClassifier_df.py" />
9293
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaForecaster_df.py" />
9394
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaChangePointDetector_df.py" />
9495
<Compile Include="nimbusml\examples\examples_from_dataframe\SsaSpikeDetector_df.py" />
@@ -140,6 +141,7 @@
140141
<Compile Include="nimbusml\examples\Handler.py" />
141142
<Compile Include="nimbusml\examples\Hinge.py" />
142143
<Compile Include="nimbusml\examples\IidChangePointDetector.py" />
144+
<Compile Include="nimbusml\examples\LinearSvmBinaryClassifier.py" />
143145
<Compile Include="nimbusml\examples\SsaForecaster.py" />
144146
<Compile Include="nimbusml\examples\SsaChangePointDetector.py" />
145147
<Compile Include="nimbusml\examples\SsaSpikeDetector.py" />
@@ -489,6 +491,7 @@
489491
<Compile Include="nimbusml\internal\core\linear_model\averagedperceptronbinaryclassifier.py" />
490492
<Compile Include="nimbusml\internal\core\linear_model\fastlinearbinaryclassifier.py" />
491493
<Compile Include="nimbusml\internal\core\linear_model\fastlinearregressor.py" />
494+
<Compile Include="nimbusml\internal\core\linear_model\linearsvmbinaryclassifier.py" />
492495
<Compile Include="nimbusml\internal\core\linear_model\logisticregressionbinaryclassifier.py" />
493496
<Compile Include="nimbusml\internal\core\linear_model\logisticregressionclassifier.py" />
494497
<Compile Include="nimbusml\internal\core\linear_model\sgdbinaryclassifier.py" />
@@ -531,6 +534,7 @@
531534
<Compile Include="nimbusml\linear_model\averagedperceptronbinaryclassifier.py" />
532535
<Compile Include="nimbusml\linear_model\fastlinearbinaryclassifier.py" />
533536
<Compile Include="nimbusml\linear_model\fastlinearregressor.py" />
537+
<Compile Include="nimbusml\linear_model\linearsvmbinaryclassifier.py" />
534538
<Compile Include="nimbusml\linear_model\logisticregressionbinaryclassifier.py" />
535539
<Compile Include="nimbusml\linear_model\logisticregressionclassifier.py" />
536540
<Compile Include="nimbusml\linear_model\sgdbinaryclassifier.py" />
@@ -587,6 +591,7 @@
587591
<Compile Include="nimbusml\tests\ensemble\__init__.py" />
588592
<Compile Include="nimbusml\tests\feature_extraction\text\test_sentiment.py" />
589593
<Compile Include="nimbusml\tests\idv\__init__.py" />
594+
<Compile Include="nimbusml\tests\linear_model\test_linearsvmbinaryclassifier.py" />
590595
<Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
591596
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
592597
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
@@ -860,6 +865,7 @@
860865
<Content Include="docs\docstrings\LightGbmRegressor.txt" />
861866
<Content Include="docs\docstrings\LightLda.txt" />
862867
<Content Include="docs\docstrings\LinearKernel.txt" />
868+
<Content Include="docs\docstrings\LinearSvmBinaryClassifier.txt" />
863869
<Content Include="docs\docstrings\Loader.txt" />
864870
<Content Include="docs\docstrings\LocalDeepSvmBinaryClassifier.txt" />
865871
<Content Include="docs\docstrings\LogisticRegressionBinaryClassifier.txt" />
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
###############################################################################
2+
# LinearSvmBinaryClassifier
3+
from nimbusml import Pipeline, FileDataStream
4+
from nimbusml.datasets import get_dataset
5+
from nimbusml.linear_model import LinearSvmBinaryClassifier
6+
7+
# data input (as a FileDataStream)
8+
path = get_dataset('infert').as_filepath()
9+
10+
data = FileDataStream.read_csv(path)
11+
print(data.head())
12+
# age case education induced parity ... row_num spontaneous ...
13+
# 0 26 1 0-5yrs 1 6 ... 1 2 ...
14+
# 1 42 1 0-5yrs 1 1 ... 2 0 ...
15+
# 2 39 1 0-5yrs 2 6 ... 3 0 ...
16+
# 3 34 1 0-5yrs 2 4 ... 4 0 ...
17+
# 4 35 1 6-11yrs 1 3 ... 5 1 ...
18+
# define the training pipeline
19+
pipeline = Pipeline([LinearSvmBinaryClassifier(
20+
feature=['age', 'parity', 'spontaneous'], label='case')])
21+
22+
# train, predict, and evaluate
23+
# TODO: Replace with CV
24+
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
25+
26+
# print predictions
27+
print(predictions.head())
28+
# PredictedLabel Score Probability
29+
# 0 1 0.688481 0.607060
30+
# 1 0 -2.514992 0.203312
31+
# 2 0 -3.479344 0.129230
32+
# 3 0 -3.016621 0.161422
33+
# 4 0 -0.825512 0.397461
34+
# print evaluation metrics
35+
print(metrics)
36+
# AUC Accuracy Positive precision Positive recall ...
37+
# 0 0.705476 0.71371 0.666667 0.289157 ...
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
###############################################################################
2+
# LinearSvmBinaryClassifier
3+
import numpy as np
4+
from nimbusml.datasets import get_dataset
5+
from nimbusml.feature_extraction.categorical import OneHotVectorizer
6+
from nimbusml.linear_model import LinearSvmBinaryClassifier
7+
from sklearn.model_selection import train_test_split
8+
9+
# use the built-in data set 'infert' to create test and train data
10+
# Unnamed: 0 education age parity induced case spontaneous stratum \
11+
# 0 1 0.0 26.0 6.0 1.0 1.0 2.0 1.0
12+
# 1 2 0.0 42.0 1.0 1.0 1.0 0.0 2.0
13+
# pooled.stratum education_str
14+
# 0 3.0 0-5yrs
15+
# 1 1.0 0-5yrs
16+
np.random.seed(0)
17+
18+
df = get_dataset("infert").as_df()
19+
20+
# remove ': ' from column names, and encode categorical column
21+
df.columns = [i.replace(': ', '') for i in df.columns]
22+
df = (OneHotVectorizer() << 'education_str').fit_transform(df)
23+
24+
X_train, X_test, y_train, y_test = \
25+
train_test_split(df.loc[:, df.columns != 'case'], df['case'])
26+
27+
lr = LinearSvmBinaryClassifier().fit(X_train, y_train)
28+
scores = lr.predict(X_test)
29+
30+
# Evaluate the model
31+
print('Accuracy:', np.mean(y_test == [i for i in scores]))
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# --------------------------------------------------------------------------------------------
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
# Licensed under the MIT License.
4+
# --------------------------------------------------------------------------------------------
5+
# - Generated by tools/entrypoint_compiler.py: do not edit by hand
6+
"""
7+
LinearSvmBinaryClassifier
8+
"""
9+
10+
__all__ = ["LinearSvmBinaryClassifier"]
11+
12+
13+
from ...entrypoints.trainers_linearsvmbinaryclassifier import \
14+
trainers_linearsvmbinaryclassifier
15+
from ...utils.utils import trace
16+
from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles
17+
18+
19+
class LinearSvmBinaryClassifier(
20+
BasePipelineItem,
21+
DefaultSignatureWithRoles):
22+
"""
23+
24+
Linear Support Vector Machine (SVM) Binary Classifier
25+
26+
.. remarks::
27+
Linear SVM implements an algorithm that finds a hyperplane in the
28+
feature space for binary classification, by solving an SVM problem.
29+
For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
30+
prediction is given by determining what side of the hyperplane the
31+
point falls into. That is the same as the sign of the features'
32+
weighted sum, i.e. *\\sum_{i = 0}^{D-1} \\left(w_i * f_i \\right) + b*,
33+
where *w_0, w_1,..., w_{D-1}* are the weights computed by the
34+
algorithm, and *b* is the bias computed by the algorithm.
35+
36+
The algorithm implemented is the PEGASOS method, which alternates
37+
between stochastic gradient descent steps and projection steps,
38+
introduced by Shalev-Shwartz, Singer and Srebro.
39+
40+
41+
**Reference**
42+
43+
`Wikipedia entry for Support Vector Machine
44+
<https://en.wikipedia.org/wiki/Support-vector_machine>`_
45+
46+
`Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
47+
<https://ttic.uchicago.edu/~shai/papers/ShalevSiSr07.pdf>`_
48+
49+
50+
:param normalize: Specifies the type of automatic normalization used:
51+
52+
* ``"Auto"``: if normalization is needed, it is performed
53+
automatically. This is the default choice.
54+
* ``"No"``: no normalization is performed.
55+
* ``"Yes"``: normalization is performed.
56+
* ``"Warn"``: if normalization is needed, a warning
57+
message is displayed, but normalization is not performed.
58+
59+
Normalization rescales disparate data ranges to a standard scale.
60+
Feature
61+
scaling ensures the distances between data points are proportional
62+
and
63+
enables various optimization methods such as gradient descent to
64+
converge
65+
much faster. If normalization is performed, a ``MinMax`` normalizer
66+
is
67+
used. It normalizes values in an interval [a, b] where ``-1 <= a <=
68+
0``
69+
and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves
70+
sparsity by mapping zero to zero.
71+
72+
:param caching: Whether trainer should cache input training data.
73+
74+
:param lambda_: Regularizer constant.
75+
76+
:param perform_projection: Perform projection to unit-ball? Typically used
77+
with batch size > 1.
78+
79+
:param number_of_iterations: Number of iterations.
80+
81+
:param initial_weights_diameter: Sets the initial weights diameter that
82+
specifies the range from which values are drawn for the initial
83+
weights. These weights are initialized randomly from within this range.
84+
For example, if the diameter is specified to be ``d``, then the weights
85+
are uniformly distributed between ``-d/2`` and ``d/2``. The default
86+
value is ``0``, which specifies that all the weights are set to zero.
87+
88+
:param no_bias: No bias.
89+
90+
:param initial_weights: Initial Weights and bias, comma-separated.
91+
92+
:param shuffle: Whether to shuffle for each training iteration.
93+
94+
:param batch_size: Batch size.
95+
96+
:param params: Additional arguments sent to compute engine.
97+
98+
.. index:: models, classification, svm
99+
100+
Example:
101+
.. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py
102+
:language: python
103+
"""
104+
105+
@trace
106+
def __init__(
107+
self,
108+
normalize='Auto',
109+
caching='Auto',
110+
lambda_=0.001,
111+
perform_projection=False,
112+
number_of_iterations=1,
113+
initial_weights_diameter=0.0,
114+
no_bias=False,
115+
initial_weights=None,
116+
shuffle=True,
117+
batch_size=1,
118+
**params):
119+
BasePipelineItem.__init__(
120+
self, type='classifier', **params)
121+
122+
self.normalize = normalize
123+
self.caching = caching
124+
self.lambda_ = lambda_
125+
self.perform_projection = perform_projection
126+
self.number_of_iterations = number_of_iterations
127+
self.initial_weights_diameter = initial_weights_diameter
128+
self.no_bias = no_bias
129+
self.initial_weights = initial_weights
130+
self.shuffle = shuffle
131+
self.batch_size = batch_size
132+
133+
@property
134+
def _entrypoint(self):
135+
return trainers_linearsvmbinaryclassifier
136+
137+
@trace
138+
def _get_node(self, **all_args):
139+
algo_args = dict(
140+
feature_column_name=self._getattr_role(
141+
'feature_column_name',
142+
all_args),
143+
label_column_name=self._getattr_role(
144+
'label_column_name',
145+
all_args),
146+
example_weight_column_name=self._getattr_role(
147+
'example_weight_column_name',
148+
all_args),
149+
normalize_features=self.normalize,
150+
caching=self.caching,
151+
lambda_=self.lambda_,
152+
perform_projection=self.perform_projection,
153+
number_of_iterations=self.number_of_iterations,
154+
initial_weights_diameter=self.initial_weights_diameter,
155+
no_bias=self.no_bias,
156+
initial_weights=self.initial_weights,
157+
shuffle=self.shuffle,
158+
batch_size=self.batch_size)
159+
160+
all_args.update(algo_args)
161+
return self._entrypoint(**all_args)

src/python/nimbusml/linear_model/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from .fastlinearbinaryclassifier import FastLinearBinaryClassifier
44
from .fastlinearclassifier import FastLinearClassifier
55
from .fastlinearregressor import FastLinearRegressor
6+
from .linearsvmbinaryclassifier import LinearSvmBinaryClassifier
67
from .logisticregressionbinaryclassifier import \
78
LogisticRegressionBinaryClassifier
89
from .logisticregressionclassifier import LogisticRegressionClassifier
@@ -17,6 +18,7 @@
1718
'FastLinearBinaryClassifier',
1819
'FastLinearClassifier',
1920
'FastLinearRegressor',
21+
'LinearSvmBinaryClassifier',
2022
'LogisticRegressionBinaryClassifier',
2123
'LogisticRegressionClassifier',
2224
'OnlineGradientDescentRegressor',

0 commit comments

Comments
 (0)