Skip to content
This repository was archived by the owner on Aug 2, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions corefxlab.sln
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.28803.156
# Visual Studio 15
VisualStudioVersion = 15.0.28307.329
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{5E7EB061-B9BC-4DA2-B5E5-859AA7C67695}"
ProjectSection(SolutionItems) = preProject
Expand Down Expand Up @@ -110,6 +110,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Numerics.Experimenta
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Numerics.Experimental.Tests", "tests\System.Numerics.Experimental.Tests\System.Numerics.Experimental.Tests.csproj", "{6411FD4E-0CDF-4478-9192-4411DC932314}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Data.DataFrame", "src\Microsoft.Data\Microsoft.Data.DataFrame.csproj", "{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Data.DataFrame.Tests", "tests\Microsoft.Data.Tests\Microsoft.Data.DataFrame.Tests.csproj", "{485FC567-4AEC-4335-B767-283173F64C42}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -696,6 +700,30 @@ Global
{6411FD4E-0CDF-4478-9192-4411DC932314}.Release|x64.Build.0 = Release|Any CPU
{6411FD4E-0CDF-4478-9192-4411DC932314}.Release|x86.ActiveCfg = Release|Any CPU
{6411FD4E-0CDF-4478-9192-4411DC932314}.Release|x86.Build.0 = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|Any CPU.Build.0 = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|x64.ActiveCfg = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|x64.Build.0 = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|x86.ActiveCfg = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Debug|x86.Build.0 = Debug|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|Any CPU.ActiveCfg = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|Any CPU.Build.0 = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|x64.ActiveCfg = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|x64.Build.0 = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|x86.ActiveCfg = Release|Any CPU
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899}.Release|x86.Build.0 = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|Any CPU.Build.0 = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|x64.ActiveCfg = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|x64.Build.0 = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|x86.ActiveCfg = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Debug|x86.Build.0 = Debug|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|Any CPU.ActiveCfg = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|Any CPU.Build.0 = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|x64.ActiveCfg = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|x64.Build.0 = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|x86.ActiveCfg = Release|Any CPU
{485FC567-4AEC-4335-B767-283173F64C42}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -749,6 +777,8 @@ Global
{544D4C8C-B5C6-4C3C-9763-E4CB6AF9A90C} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
{CB424147-4ACB-4C35-AB24-8BD27D6AB1B9} = {4B000021-5278-4F2A-B734-DE49F55D4024}
{6411FD4E-0CDF-4478-9192-4411DC932314} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
{AD22AAD4-FCB0-4DB4-BC38-AB6ACD153899} = {4B000021-5278-4F2A-B734-DE49F55D4024}
{485FC567-4AEC-4335-B767-283173F64C42} = {3079E458-D0E6-4F99-8CAB-80011D35C7DA}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9DD4022C-A010-4A9B-BCC5-171566D4CB17}
Expand Down
41 changes: 41 additions & 0 deletions src/Microsoft.Data/BaseDataFrameColumn.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.Data
{
/// <summary>
/// The base column type. All APIs should have at least a stub here first.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

atleast is two words.

/// </summary>
public abstract class BaseDataFrameColumn
{
/// <summary>
/// Initializes the shared column state. The constructor is protected because this
/// type is abstract and can only ever be constructed through a derived column.
/// </summary>
/// <param name="name">The name of the column.</param>
/// <param name="length">The initial length of the column; defaults to 0.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown by the <see cref="Length"/> setter when <paramref name="length"/> is negative.
/// </exception>
protected BaseDataFrameColumn(string name, long length = 0)
{
    Length = length;
    Name = name;
}

private long _length;

/// <summary>
/// Gets the length of the column. Only this class and derived columns may set it.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when the assigned value is negative.
/// </exception>
public long Length
{
    get => _length;
    protected set
    {
        if (value < 0)
        {
            // Name the offending argument so the failure is diagnosable from the message.
            throw new ArgumentOutOfRangeException(nameof(value));
        }

        _length = value;
    }
}

/// <summary>
/// Gets the null count for this column. The setter is protected, so only this class
/// and derived columns can update it.
/// NOTE(review): presumably the number of null entries currently stored - confirm against derived columns.
/// </summary>
public long NullCount { get; protected set; }

/// <summary>
/// Gets or sets the name of the column. Exposed as a property rather than a public
/// field so that validation or change tracking can be added later without breaking callers.
/// </summary>
public string Name { get; set; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Public fields are a bad practice. This must be replaced by a property.


/// <summary>
/// Stub indexer for reading or writing the value at <paramref name="rowIndex"/>.
/// The base implementation always throws; it is virtual so derived columns can override it.
/// </summary>
/// <param name="rowIndex">The row to access.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
public virtual object this[long rowIndex]
{
    get => throw new NotImplementedException();
    set => throw new NotImplementedException();
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is temporary code, but long term it should be on several lines (using => syntax if it's oneliners)


/// <summary>
/// Stub read-only indexer taking a start index and a length.
/// The base implementation always throws; it is virtual so derived columns can override it.
/// NOTE(review): presumably returns <paramref name="length"/> values beginning at
/// <paramref name="startIndex"/> - confirm against the overriding column types.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
public virtual object this[long startIndex, int length]
{
    get => throw new NotImplementedException();
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how the current API of System.Data is designed. It uses boxing in many places. I think that these methods should be removed from the base class and be implemented in DataFrameColumn<T>.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are overridden in DataFrameColumn at the moment. Do you mean they should occur only in DataFrameColumn and not in the base class? I'm not completely convinced about that yet. At the moment, the DataFrame class has a DataFrameTable which has an IList called columns, i.e. the DataFrameTable does not know the real type of the columns it holds. Therefore, the approach at the moment is to add APIs on BaseDataFrameColumn and override them in the derived columns. It is possible that this will change down the line as I add more features, but for the moment I think this looks reasonable. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, let's change it a bit later.

}
}
96 changes: 96 additions & 0 deletions src/Microsoft.Data/DataFrame.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.Data
{
/// <summary>
/// A DataFrame to support indexing, binary operations, sorting, selection and other APIs. This will eventually also expose an IDataView for ML.NET
/// </summary>
public partial class DataFrame
{
// Backing table that holds the columns; every public member below delegates to it.
private readonly DataFrameTable _table;

/// <summary>
/// Creates a DataFrame backed by a newly constructed <c>DataFrameTable</c>.
/// </summary>
public DataFrame() => _table = new DataFrameTable();

/// <summary>Gets the row count reported by the underlying table.</summary>
public long RowCount
{
    get { return _table.RowCount; }
}

/// <summary>Gets the column count reported by the underlying table.</summary>
public int ColumnCount
{
    get { return _table.ColumnCount; }
}

/// <summary>
/// Gets the names of all columns, in column-index order.
/// A new list is built on every access, so mutating the result does not affect the DataFrame.
/// </summary>
public IList<string> Columns
{
    get
    {
        var names = new List<string>(ColumnCount);
        int index = 0;
        while (index < ColumnCount)
        {
            var column = _table.Column(index);
            names.Add(column.Name);
            index++;
        }

        return names;
    }
}

/// <summary>Returns the column at <paramref name="index"/> from the underlying table.</summary>
public BaseDataFrameColumn Column(int index)
{
    return _table.Column(index);
}

/// <summary>Forwards to the underlying table's <c>InsertColumn</c> with the same arguments.</summary>
public void InsertColumn(int columnIndex, BaseDataFrameColumn column)
{
    _table.InsertColumn(columnIndex, column);
}

/// <summary>Forwards to the underlying table's <c>SetColumn</c> with the same arguments.</summary>
public void SetColumn(int columnIndex, BaseDataFrameColumn column)
{
    _table.SetColumn(columnIndex, column);
}

/// <summary>Forwards to the underlying table's <c>RemoveColumn</c>, identifying the column by index.</summary>
public void RemoveColumn(int columnIndex)
{
    _table.RemoveColumn(columnIndex);
}

/// <summary>Forwards to the underlying table's <c>RemoveColumn</c>, identifying the column by name.</summary>
public void RemoveColumn(string columnName)
{
    _table.RemoveColumn(columnName);
}

/// <summary>
/// Gets or sets a single cell: the column is looked up by <paramref name="columnIndex"/>,
/// then indexed by <paramref name="rowIndex"/>.
/// </summary>
public object this[long rowIndex, int columnIndex]
{
    get
    {
        return _table.Column(columnIndex)[rowIndex];
    }
    set
    {
        _table.Column(columnIndex)[rowIndex] = value;
    }
}

#region Operators
/// <summary>
/// Gets the row at <paramref name="rowIndex"/> as returned by the underlying table's <c>GetRow</c>.
/// </summary>
public IList<object> this[long rowIndex]
{
    get => _table.GetRow(rowIndex);
    //TODO?: set?
}

public object this[string columnName]
{
get
{
int columnIndex = _table.GetColumnIndex(columnName);
if (columnIndex == -1) throw new ArgumentException($"{columnName} does not exist");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should GetColumnIndex throw instead? Or is asking for an unknown column a reasonable thing to do?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not throwing yet because I want to support things like:

df["Int3"] = df["Int1"] + df["Int2"];

where df["Int3"] will be created if it doesn't exist. So, I'm not throwing yet. I don't like returning -1 either, so if I can write the setter (when I get around to it) without the -1, I'll add code to throw then.

return _table.Column(columnIndex); //[0, (int)Math.Min(_table.NumRows, Int32.MaxValue)];
}
}

/// <summary>
/// Returns the first <paramref name="numberOfRows"/> rows, each fetched through the row indexer.
/// </summary>
/// <param name="numberOfRows">
/// The number of rows requested. Values larger than <see cref="RowCount"/> return every row;
/// zero or negative values return an empty list.
/// </param>
/// <returns>A new list holding the selected rows in order.</returns>
public IList<IList<object>> Head(int numberOfRows)
{
    // Clamp to the available rows so an oversized request never indexes past the last row.
    long count = Math.Min(numberOfRows, RowCount);
    var ret = new List<IList<object>>();
    for (long i = 0; i < count; i++)
    {
        ret.Add(this[i]);
    }
    return ret;
}

/// <summary>
/// Returns the last <paramref name="numberOfRows"/> rows, each fetched through the row indexer.
/// </summary>
/// <param name="numberOfRows">
/// The number of rows requested. Values larger than <see cref="RowCount"/> return every row;
/// zero or negative values return an empty list.
/// </param>
/// <returns>A new list holding the selected rows in their original order.</returns>
public IList<IList<object>> Tail(int numberOfRows)
{
    // Clamp the start to 0 so an oversized request never begins at a negative row index.
    long start = Math.Max(0, RowCount - numberOfRows);
    var ret = new List<IList<object>>();
    for (long i = start; i < RowCount; i++)
    {
        ret.Add(this[i]);
    }
    return ret;
}
// TODO: Add strongly typed versions of these APIs
#endregion
}
}
Loading