|
3 | 3 | // See the LICENSE file in the project root for more information. |
4 | 4 |
|
5 | 5 | using System; |
| 6 | +using System.Collections; |
6 | 7 | using System.Collections.Generic; |
| 8 | +using System.Collections.Specialized; |
7 | 9 | using System.Globalization; |
8 | 10 | using System.IO; |
9 | 11 | using System.Linq; |
@@ -49,23 +51,35 @@ internal static IDictionary<string, string> GenerateSampleData(string inputFile, |
49 | 51 |
|
/// <summary>
/// Builds a sample-data dictionary from the first row of <paramref name="dataView"/>.
/// Keys are the normalized, unique names of the feature columns (every schema column
/// that is neither the label column nor an ignored column). Values are the strings
/// produced by <c>GetValueFromColumn</c> for the first row, or <c>null</c> when the
/// data view yields no rows.
/// </summary>
/// <param name="dataView">Data view whose schema and first row are sampled.</param>
/// <param name="columnInference">Column inference results naming the label and ignored columns.</param>
/// <returns>A dictionary mapping each generated column name to its sample value.</returns>
internal static IDictionary<string, string> GenerateSampleData(IDataView dataView, ColumnInferenceResults columnInference)
{
    var columnInformation = columnInference.ColumnInformation;
    var featureColumns = dataView.Schema.ToList().FindAll(
        col => col.Name != columnInformation.LabelColumnName &&
               !columnInformation.IgnoredColumnNames.Contains(col.Name));

    // Get normalized and unique column names. If there are duplicate column names, the
    // differentiator suffix '_col_x' will be added to each column name, where 'x' is
    // the load order for a given column. The returned list is index-aligned with
    // featureColumns, so position i in one corresponds to position i in the other.
    List<string> normalizedColumnNames = GenerateColumnNames(featureColumns.Select(column => column.Name).ToList());

    // Pre-populate every key so the result has an entry per feature column
    // even when the data view contains no rows.
    var sampleData = new Dictionary<string, string>(normalizedColumnNames.Count);
    foreach (string columnName in normalizedColumnNames)
    {
        sampleData[columnName] = null;
    }

    // The cursor is disposable; release it once the first row has been read.
    using (var rowCursor = dataView.GetRowCursor(featureColumns))
    {
        if (rowCursor.MoveNext())
        {
            var getGetGetterMethod = typeof(Utils).GetMethod(nameof(Utils.GetValueFromColumn), BindingFlags.Static | BindingFlags.Public | BindingFlags.NonPublic);

            // Walk the feature columns by index rather than by name: the generated
            // names may carry the '_col_x' suffix, so the original column name cannot
            // be used to look up the matching sample-data key.
            for (int i = 0; i < featureColumns.Count; i++)
            {
                var getGenericGetGetterMethod = getGetGetterMethod.MakeGenericMethod(featureColumns[i].Type.RawType);
                string val = getGenericGetGetterMethod.Invoke(null, new object[] { rowCursor, featureColumns[i] }) as string;
                sampleData[normalizedColumnNames[i]] = val;
            }
        }
    }

    return sampleData;
}
70 | 84 |
|
71 | 85 | internal static string GetValueFromColumn<T>(DataViewRowCursor rowCursor, DataViewSchema.Column column) |
@@ -247,8 +261,7 @@ internal static int CreateSolutionFile(string solutionFile, string outputPath) |
247 | 261 | internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnInferenceResults, IDictionary<string, CodeGeneratorSettings.ColumnMapping> columnMapping = default) |
248 | 262 | { |
249 | 263 | IList<string> result = new List<string>(); |
250 | | - List<string> normalizedColumnNames = new List<string>(); |
251 | | - bool duplicateColumnNamesExist = false; |
| 264 | + List<string> columnNames = new List<string>(); |
252 | 265 | foreach (var column in columnInferenceResults.TextLoaderOptions.Columns) |
253 | 266 | { |
254 | 267 | StringBuilder sb = new StringBuilder(); |
@@ -284,28 +297,47 @@ internal static IList<string> GenerateClassLabels(ColumnInferenceResults columnI |
284 | 297 | result.Add($"[ColumnName(\"{columnName}\"), LoadColumn({column.Source[0].Min})]"); |
285 | 298 | } |
286 | 299 | sb.Append(" "); |
287 | | - string normalizedColumnName = Utils.Normalize(column.Name); |
288 | | - // Put placeholder for normalized and unique version of column name |
289 | | - if (!duplicateColumnNamesExist && normalizedColumnNames.Contains(normalizedColumnName)) |
290 | | - duplicateColumnNamesExist = true; |
291 | | - normalizedColumnNames.Add(normalizedColumnName); |
| 300 | + columnNames.Add(column.Name); |
292 | 301 | result.Add(sb.ToString()); |
293 | 302 | result.Add("\r\n"); |
294 | 303 | } |
| 304 | + // Get normalized and unique column names. If there are duplicate column names, the |
| 305 | + // differentiator suffix '_col_x' will be added to each column name, where 'x' is |
| 306 | + // the load order for a given column. |
| 307 | + List<string> normalizedColumnNames = GenerateColumnNames(columnNames); |
295 | 308 | for (int i = 1; i < result.Count; i+=3) |
296 | 309 | { |
297 | 310 | // Get normalized column name for correctly typed class property name |
298 | | - // If duplicate column names exist, the only way to ensure all generated column names are unique is to add |
299 | | - // a differentiator depending on the column load order from dataset. |
300 | | - if (duplicateColumnNamesExist) |
301 | | - result[i] += normalizedColumnNames[i/3] + $"_col_{i/3}"; |
302 | | - else |
303 | | - result[i] += normalizedColumnNames[i/3]; |
| 311 | + result[i] += normalizedColumnNames[i/3]; |
304 | 312 | result[i] += "{get; set;}"; |
305 | 313 | } |
306 | 314 | return result; |
307 | 315 | } |
308 | 316 |
|
/// <summary>
/// Take a list of column names that may not be normalized to fit property name standards
/// and may contain duplicate column names. Return unique and normalized column names.
/// The input list is not modified; a new list is returned.
/// </summary>
/// <param name="columnNames">Column names to normalize, in load order.</param>
/// <returns>
/// A new list, index-aligned with <paramref name="columnNames"/>, holding the normalized
/// names. If any two normalized names collide, every name gets the suffix '_col_x',
/// where 'x' is the zero-based position of the column in the list.
/// </returns>
internal static List<string> GenerateColumnNames(List<string> columnNames)
{
    // Normalize into a fresh list so the caller's list is left untouched.
    List<string> normalizedNames = columnNames.Select(name => Utils.Normalize(name)).ToList();

    // A set collapses duplicates, so a set smaller than the list means
    // at least two normalized names are identical.
    HashSet<string> distinctNames = new HashSet<string>(normalizedNames);
    if (distinctNames.Count != normalizedNames.Count)
    {
        // Suffix every name (not only the colliding ones) with its load order
        // so that all generated names are guaranteed to be unique.
        for (int i = 0; i < normalizedNames.Count; i++)
        {
            normalizedNames[i] += String.Concat("_col_", i);
        }
    }

    return normalizedNames;
}
| 340 | + |
309 | 341 | internal static string GetSymbolOfDataKind(DataKind dataKind) |
310 | 342 | { |
311 | 343 | switch (dataKind) |
|
0 commit comments