Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
360 changes: 359 additions & 1 deletion src/DotNetBridge/NativeDataInterop.cs

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions src/DotNetBridge/RunGraph.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ public unsafe static partial class Bridge
// std:null specifier in a graph, used to redirect output to std::null
const string STDNULL = "<null>";

// graph output format specifier, used to output to a sparse csr matrix
const string CSR_MATRIX = "<csr>";

private static void SaveIdvToFile(IDataView idv, string path, IHost host)
{
if (path == STDNULL)
Expand Down Expand Up @@ -183,14 +186,18 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s
throw host.ExceptNotSupp("File handle outputs not yet supported.");
case TlcModule.DataKind.DataView:
var idv = runner.GetOutput<IDataView>(varName);
if (!string.IsNullOrWhiteSpace(path))
if (path == CSR_MATRIX)
{
SendViewToNativeAsCsr(ch, penv, idv);
}
else if (!string.IsNullOrWhiteSpace(path))
{
SaveIdvToFile(idv, path, host);
}
else
{
var infos = ProcessColumns(ref idv, penv->maxSlots, host);
SendViewToNative(ch, penv, idv, infos);
SendViewToNativeAsDataFrame(ch, penv, idv, infos);
}
break;
case TlcModule.DataKind.PredictorModel:
Expand Down
2 changes: 1 addition & 1 deletion src/NativeBridge/ManagedInterop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void EnvironmentBlock::DataSinkCore(const DataViewBlock * pdata)
throw std::invalid_argument("data type is not supported " + std::to_string(kind));
}

if (pdata->keyCards[i] >= 0)
if (pdata->keyCards && (pdata->keyCards[i] >= 0))
{
_vKeyValues.push_back(new PythonObject<std::string>(TX, pdata->keyCards[i], 1));
_columnToKeyMap.push_back(numKeys++);
Expand Down
1 change: 1 addition & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -669,6 +669,7 @@
<Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
<Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
<Compile Include="nimbusml\tests\preprocessing\test_datasettransformer.py" />
<Compile Include="nimbusml\tests\test_csr_matrix_output.py" />
<Compile Include="nimbusml\tests\timeseries\test_iidchangepointdetector.py" />
<Compile Include="nimbusml\tests\timeseries\test_ssaforecaster.py" />
<Compile Include="nimbusml\tests\timeseries\test_ssachangepointdetector.py" />
Expand Down
10 changes: 5 additions & 5 deletions src/python/nimbusml/internal/utils/data_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ def __init__(self, parent, columns):

class BinaryDataStream(DataStream):
"""
Defines a data view.
Data accessor for IDV data format, see here https://github.com/dotnet/machinelearning/blob/master/docs/code/IDataViewImplementation.md
"""

def __init__(self, filename):
Expand All @@ -419,15 +419,15 @@ def to_df(self):
# Do not move these imports or the module fails
# due to circular references.
from ..entrypoints.transforms_nooperation import transforms_nooperation
from .entrypoints import Graph
from .entrypoints import Graph, DataOutputFormat

no_op = transforms_nooperation(
data='$data', output_data='$output_data')
graph_nodes = [no_op]
graph = Graph(
dict(
data=''), dict(
output_data=''), False, *(graph_nodes))
output_data=''), DataOutputFormat.DF, *(graph_nodes))
(out_model, out_data, out_metrics) = graph.run(verbose=True, X=self)
return out_data

Expand All @@ -438,7 +438,7 @@ def head(self, n=5, skip=0):
transforms_rowtakefilter
from ..entrypoints.transforms_rowskipfilter import \
transforms_rowskipfilter
from .entrypoints import Graph
from .entrypoints import Graph, DataOutputFormat
if n == 0:
raise ValueError("n must be > 0")
graph_nodes = []
Expand All @@ -456,7 +456,7 @@ def head(self, n=5, skip=0):
graph = Graph(
dict(
data=''), dict(
output_data=''), False, *(graph_nodes))
output_data=''), DataOutputFormat.DF, *(graph_nodes))
(out_model, out_data, out_metrics) = graph.run(verbose=True, X=self)
return out_data

Expand Down
11 changes: 10 additions & 1 deletion src/python/nimbusml/internal/utils/dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def get_obj(el):
return concat(els, axis=axis, join=join)


def resolve_output(ret):
def resolve_output_as_dataframe(ret):
data = dict()
for key in ret.keys():
if not isinstance(ret[key], dict):
Expand All @@ -212,6 +212,15 @@ def resolve_output(ret):
return DataFrame(data)


def resolve_output_as_csrmatrix(ret):
matrix = csr_matrix((1, 1))

if all([k in ret for k in ('data', 'indices', 'indptr', 'shape')]):
matrix = csr_matrix((ret['data'], ret['indices'], ret['indptr']),
shape=(ret['shape'][0], ret['shape'][1]))
return matrix


# Any changes to this dictionary must also be done in the enum
# ML_PY_TYPE_MAP_ENUM defined in DataViewInterop.h.
_global_dtype_to_char_dict = {
Expand Down
52 changes: 35 additions & 17 deletions src/python/nimbusml/internal/utils/entrypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from .data_stream import BinaryDataStream
from .data_stream import FileDataStream
from .dataframes import resolve_dataframe, resolve_csr_matrix, pd_concat, \
resolve_output
resolve_output_as_dataframe, resolve_output_as_csrmatrix
from .utils import try_set, set_clr_environment_vars, get_clr_path, \
get_mlnet_path, get_dprep_path
from ..libs.pybridge import px_call
Expand Down Expand Up @@ -141,6 +141,15 @@ def _get_temp_file(suffix=None):
return file_name


class DataOutputFormat(Enum):
Copy link
Collaborator Author

@pieths pieths Sep 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename:
IDV (in docs point to the idv public page for description)
CSR
DF (dataframe)

Use numbers for values.
#Resolved

# Regular pandas dataframe format
DF = 0
# IDV data format, see here https://github.com/dotnet/machinelearning/blob/master/docs/code/IDataViewImplementation.md
IDV = 1
# csr_matrix sparse data format
CSR = 2


class Graph(EntryPoint):
"""
graph
Expand All @@ -166,7 +175,7 @@ def __init__(
self,
inputs=None,
outputs=None,
output_binary_data_stream=False,
data_output_format=DataOutputFormat.DF,
*nodes):
Graph._check_nodes(nodes)

Expand All @@ -191,7 +200,7 @@ def __init__(

self.nodes = nodes
self._write_csv_time = 0
self._output_binary_data_stream = output_binary_data_stream
self._data_output_format = data_output_format

def __iter__(self):
return iter(self.nodes)
Expand Down Expand Up @@ -387,10 +396,13 @@ def remove_multi_level_index(c):
output_metricsfilename = _get_temp_file(suffix='.txt')
self.outputs['output_metrics'] = output_metricsfilename

if 'output_data' in self.outputs and \
self._output_binary_data_stream:
output_idvfilename = _get_temp_file(suffix='.idv')
self.outputs['output_data'] = output_idvfilename
if 'output_data' in self.outputs:
if self._data_output_format == DataOutputFormat.IDV:
output_idvfilename = _get_temp_file(suffix='.idv')
self.outputs['output_data'] = output_idvfilename

elif self._data_output_format == DataOutputFormat.CSR:
self.outputs['output_data'] = "<csr>"

# set graph file for debuggings
if verbose > 0:
Expand Down Expand Up @@ -425,15 +437,21 @@ def remove_multi_level_index(c):
concatenated,
output_modelfilename)

out_data = resolve_output(ret)
# remove label column from data
if out_data is not None and concatenated:
out_columns = list(out_data.columns)
if hasattr(y, 'columns'):
y_column = y.columns[0]
if y_column in out_columns:
out_columns.remove(y_column)
out_data = out_data[out_columns]
out_data = None

if not cv and self._data_output_format == DataOutputFormat.CSR:
out_data = resolve_output_as_csrmatrix(ret)
else:
out_data = resolve_output_as_dataframe(ret)
# remove label column from data
if out_data is not None and concatenated:
out_columns = list(out_data.columns)
if hasattr(y, 'columns'):
y_column = y.columns[0]
if y_column in out_columns:
out_columns.remove(y_column)
out_data = out_data[out_columns]

if output_metricsfilename:
out_metrics = pd.read_csv(
output_metricsfilename,
Expand All @@ -446,7 +464,7 @@ def remove_multi_level_index(c):

if cv:
return self._process_graph_run_results(out_data)
elif self._output_binary_data_stream:
elif self._data_output_format == DataOutputFormat.IDV:
output = BinaryDataStream(output_idvfilename)
return (output_modelfilename, output, out_metrics)
else:
Expand Down
4 changes: 2 additions & 2 deletions src/python/nimbusml/model_selection/cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
transforms_manyheterogeneousmodelcombiner
from ..internal.entrypoints.transforms_modelcombiner import \
transforms_modelcombiner
from ..internal.utils.entrypoints import Graph, GraphOutputType
from ..internal.utils.entrypoints import Graph, GraphOutputType, DataOutputFormat


# Extension method for extending a list of steps, with chaining
Expand Down Expand Up @@ -544,7 +544,7 @@ def fit(
group_column=group_id)

steps.add(cv_node)
graph = Graph(cv_aux_info.inputs, self.outputs, False, *steps)
graph = Graph(cv_aux_info.inputs, self.outputs, DataOutputFormat.DF, *steps)

# prepare telemetry info
class_name = type(self).__name__
Expand Down
34 changes: 27 additions & 7 deletions src/python/nimbusml/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
from .internal.utils.data_schema import DataSchema
from .internal.utils.data_stream import DataStream, ViewDataStream, \
FileDataStream, BinaryDataStream
from .internal.utils.entrypoints import Graph
from .internal.utils.entrypoints import Graph, DataOutputFormat
from .internal.utils.schema_helper import _extract_label_column
from .internal.utils.utils import trace, unlist

Expand Down Expand Up @@ -824,10 +824,18 @@ def _fit_graph(self, X, y, verbose, **params):
if learner_node is None: # last node is transformer
outputs[output_data.replace(
'$', '')] = '' if do_fit_transform else '<null>'

data_output_format = DataOutputFormat.DF
if do_fit_transform:
if output_binary_data_stream:
data_output_format = DataOutputFormat.IDV
elif params.pop('as_csr', False):
data_output_format = DataOutputFormat.CSR

graph = Graph(
inputs,
outputs,
do_fit_transform and output_binary_data_stream,
data_output_format,
*(graph_nodes))

# Checks that every parameter in params was used.
Expand Down Expand Up @@ -1774,10 +1782,13 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,

outputs = dict(output_data="")

data_output_format = DataOutputFormat.IDV if as_binary_data_stream \
else DataOutputFormat.DF,

graph = Graph(
inputs,
outputs,
as_binary_data_stream,
data_output_format,
*all_nodes)

class_name = type(self).__name__
Expand Down Expand Up @@ -1903,10 +1914,13 @@ def _predict(self, X, y=None,
else:
outputs = dict(output_data="")

data_output_format = DataOutputFormat.IDV if as_binary_data_stream \
else DataOutputFormat.DF,

graph = Graph(
inputs,
outputs,
as_binary_data_stream,
data_output_format,
*all_nodes)

class_name = type(self).__name__
Expand Down Expand Up @@ -2241,10 +2255,16 @@ def transform(

all_nodes.extend([apply_node])

data_output_format = DataOutputFormat.DF
if as_binary_data_stream:
data_output_format = DataOutputFormat.IDV
elif params.pop('as_csr', False):
data_output_format = DataOutputFormat.CSR

graph = Graph(
inputs,
dict(output_data=""),
as_binary_data_stream,
data_output_format,
*all_nodes)

class_name = type(self).__name__
Expand Down Expand Up @@ -2316,7 +2336,7 @@ def summary(self, verbose=0, **params):
graph = Graph(
inputs,
outputs,
False,
DataOutputFormat.DF,
*all_nodes)

class_name = type(self).__name__
Expand Down Expand Up @@ -2566,7 +2586,7 @@ def combine_models(cls, *items, **params):
graph = Graph(
inputs,
outputs,
False,
DataOutputFormat.DF,
*nodes)

class_name = cls.__name__
Expand Down
Loading