From bf4f7e41a3b61cda71243f548dd33c05debe24ad Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Mon, 24 Jun 2019 19:50:44 -0500 Subject: [PATCH 01/12] Created ranking sample --- .../PersonalizedRanking.csproj | 26 ++ .../PersonalizedRanking.sln | 25 ++ .../Common/ConsoleHelper.cs | 62 +++++ .../DataStructures/HotelData.cs | 230 ++++++++++++++++++ .../DataStructures/HotelPrediction.cs | 22 ++ .../DataStructures/HotelRelevance.cs | 10 + .../PersonalizedRanking/Mapper.cs | 34 +++ .../PersonalizedRanking.csproj | 13 + .../PersonalizedRanking/Program.cs | 188 ++++++++++++++ .../Ranking_PersonalizedSort/README.md | 216 ++++++++++++++++ samples/csharp/v1.0.0-All-Samples.sln | 17 +- 11 files changed, 841 insertions(+), 2 deletions(-) create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/README.md diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj new file mode 100644 index 000000000..5fd7d123e --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj @@ -0,0 +1,26 @@ + + + + Exe + netcoreapp2.2 + + + + + + + + + + + + + + + + + + + + + diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln new file mode 100644 index 000000000..a63d6b64d --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PersonalizedRanking", "PersonalizedRanking\PersonalizedRanking.csproj", "{F71F24D8-F174-461F-B375-508EFB827A33}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F71F24D8-F174-461F-B375-508EFB827A33}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F71F24D8-F174-461F-B375-508EFB827A33}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F71F24D8-F174-461F-B375-508EFB827A33}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F71F24D8-F174-461F-B375-508EFB827A33}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + 
GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {92FA42B0-28BF-4531-B744-F3125DAAC91A} + EndGlobalSection +EndGlobal diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs new file mode 100644 index 000000000..812ce1184 --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs @@ -0,0 +1,62 @@ +using Microsoft.ML; +using Microsoft.ML.Data; +using PersonalizedRanking.DataStructures; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PersonalizedRanking.Common +{ + public class ConsoleHelper + { + // To evaluate the accuracy of the model's predicted rankings, prints out the Discounted Cumulative Gain and Normalized Discounted Cumulative Gain for hotel search queries. + public static void EvaluateMetrics(MLContext mlContext, IDataView scoredData) + { + // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). + RankingMetrics metrics = mlContext.Ranking.Evaluate(scoredData); + + Console.WriteLine($"DCG: {string.Join(", ", metrics.DiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + + Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + } + + // Performs evaluation with the truncation level set up to 10 hotel search results within a query. + // This is a temporary workaround for this issue: https://github.com/dotnet/machinelearning/issues/2728. + public static void EvaluateMetrics(MLContext mlContext, IDataView scoredData, int truncationLevel) + { + if (truncationLevel < 1 || truncationLevel > 10) + { + throw new InvalidOperationException("Currently metrics are only supported for 1 to 10 truncation levels."); + } + + // Uses reflection to set the truncation level before calling evaluate. + var mlAssembly = AppDomain.CurrentDomain.GetAssemblies().Where(a => a.FullName.Contains("Microsoft.ML.Data")).First(); + var rankEvalType = mlAssembly.DefinedTypes.Where(t => t.Name.Contains("RankingEvaluator")).First(); + + var evalArgsType = rankEvalType.GetNestedType("Arguments"); + var evalArgs = Activator.CreateInstance(rankEvalType.GetNestedType("Arguments")); + + var dcgLevel = evalArgsType.GetField("DcgTruncationLevel"); + dcgLevel.SetValue(evalArgs, truncationLevel); + + var ctor = rankEvalType.GetConstructors().First(); + var evaluator = ctor.Invoke(new object[] { mlContext, evalArgs }); + + var evaluateMethod = rankEvalType.GetMethod("Evaluate"); + RankingMetrics metrics = (RankingMetrics)evaluateMethod.Invoke(evaluator, new object[] { scoredData, "Label", "GroupId", "Score" }); + + Console.WriteLine($"DCG: {string.Join(", ", metrics.DiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + + Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + } + + // Prints out the individual scores used to determine the relative ranking.
+ public static void PrintScores(IEnumerable predictions) + { + foreach (var prediction in predictions) + { + Console.WriteLine($"GroupId: {prediction.GroupId}, Score: {prediction.PredictedRank}"); + } + } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs new file mode 100644 index 000000000..eedff1aac --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs @@ -0,0 +1,230 @@ +using Microsoft.ML.Data; +using System; + +namespace PersonalizedRanking.DataStructures +{ + // Representation of the Expedia data set: https://www.kaggle.com/c/expedia-personalized-sort/data. Specifically, this is used for training and testing the model. + public class HotelData + { + // Maps to the "Srch_Id" column; this is the id of the search\query. + [LoadColumn(0)] + public uint GroupId { get; set; } + + // Maps to the "Date_Time" column; this is the date\time of the search. + [LoadColumn(1)] + public DateTime Srch_DateTime { get; set; } + + // Maps to the "Site_Id" column; this is the id of the Expedia point of sale (e.g. Expedia.com, Expedia.co.uk, etc.) + [LoadColumn(2)] + public float Site_Id { get; set; } + + // Maps to the "Visitor_Location_Country_Id" column; this is the id of the country the customer is located in. + [LoadColumn(3)] + public float Visitor_Location_Country_Id { get; set; } + + // Maps to the "Visitor_Hist_Starrating" column; this is the mean star rating of hotels the customer has previously purchased; null signifies there is no purchase history on the customer. + [LoadColumn(4)] + public float Visitor_Hist_Star_Rating { get; set; } + + // Maps to the "Visitor_Hist_Adr_USD" column; this is the mean price per night (in USD) of the hotels the customer has previously purchased; null signifies there is no purchase history on the customer. + [LoadColumn(5)] + public float Visitor_Hist_Adr_USD { get; set; } + + // Maps to the "Prop_Country_Id" column; this is the id of the country the hotel is located in. + [LoadColumn(6)] + public float Prop_Country_Id { get; set; } + + // Maps to the "Prop_Id" column; this is the id of the hotel. + [LoadColumn(7)] + public float Prop_Id { get; set; } + + // Maps to the "Prop_Starrating" column; this is the star rating of the hotel, from 1 to 5 in increments of 1. A 0 indicates the property has no stars, or that the star rating is not known or cannot be publicized. + [LoadColumn(8)] + public float Prop_Star_Rating { get; set; } + + // Maps to the "Prop_Review_Score" column; this is the mean customer review score for the hotel on a scale out of 5, rounded to 0.5 increments. A 0 means there have been no reviews, null that the information is not available. + [LoadColumn(9)] + public float Prop_Review_Score { get; set; } + + // Maps to the "Prop_Brand_Bool" column; this has +1 if the hotel is part of a major hotel chain; 0 if it is an independent hotel. + [LoadColumn(10)] + public float Prop_Brand { get; set; } + + // Maps to the "Prop_Location_Score1" column; this is the first score outlining the desirability of a hotel's location. + [LoadColumn(11)] + public float Prop_Loc_Score1 { get; set; } + + // Maps to the "Prop_Location_Score2" column; this is the second score outlining the desirability of a hotel's location.
+ [LoadColumn(12)] + public float Prop_Loc_Score2 { get; set; } + + // Maps to the "Prop_Log_Historical_Price" column; this is the logarithm of the mean price of the hotel over the last trading period. A 0 will occur if the hotel was not sold in that period. + [LoadColumn(13)] + public float Prop_Log_Historical_Price { get; set; } + + // Maps to the "Position" column; this is the hotel position in Expedia's search results page. + [LoadColumn(14)] + public float Position { get; set; } + + // Maps to the "Price_USD" column; this is the displayed price of the hotel for the given search. Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or the whole stay. + [LoadColumn(15)] + public float Price_USD { get; set; } + + // Maps to the "Promotion_Flag" column; this has +1 if the hotel had a sale price promotion specifically displayed. + [LoadColumn(16)] + public float Promotion_Flag { get; set; } + + // Maps to the "Srch_Destination_Id" column; this is the id of the destination where the hotel search was performed. + [LoadColumn(17)] + public float Srch_Destination_ID { get; set; } + + // Maps to the "Srch_Length_Of_Stay" column; this is the number of nights stay that was searched. + [LoadColumn(18)] + public float Srch_Length_Of_Stay { get; set; } + + // Maps to the "Srch_Booking_Window" column; this is the number of days in the future the hotel stay started from the search date. + [LoadColumn(19)] + public float Srch_Booking_Window { get; set; } + + // Maps to the "Srch_Adults_Count" column; this is the number of adults specified in the hotel room. + [LoadColumn(20)] + public float Srch_Adults_Count { get; set; } + + // Maps to the "Srch_Children_Count" column; this is the number of (extra occupancy) children specified in the hotel room. + [LoadColumn(21)] + public float Srch_Children_Count { get; set; } + + // Maps to the "Srch_Room_Count" column; this is the number of hotel rooms specified in the search. + [LoadColumn(22)] + public float Srch_Room_Count { get; set; } + + // Maps to the "Srch_Saturday_Night_Bool" column; this has +1 if the stay includes a Saturday night, starts from Thursday, and has a length of stay of 4 nights or less (e.g. a weekend) - otherwise 0. + [LoadColumn(23)] + public float Srch_Saturday_Night { get; set; } + + // Maps to the "Srch_Query_Affinity_Score" column; this is the log of the probability a hotel will be clicked on in internet searches (hence the values are negative). Null signifies there is no data (e.g. hotel did not register in any searches). + [LoadColumn(24)] + public float Srch_Query_Affinity_Score { get; set; } + + // Maps to the "Orig_Destination_Distance" column; this is the physical distance between the hotel and the customer at the time of the search. A null means the distance could not be calculated. + [LoadColumn(25)] + public float Orig_Destination_Distance { get; set; } + + // Maps to the "Random_Bool" column; this is +1 when the displayed sort was random - 0 when the normal sort order (determined by Expedia's algorithm) was displayed. + [LoadColumn(26)] + public float Random_Position { get; set; } + + // Maps to the "Comp1_Rate" column; this is +1 if Expedia has a lower price than competitor 1 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 1. Null signifies there is no competitive data.
+ [LoadColumn(27)] + public float Comp1_Rate { get; set; } + + // Maps to the "Comp1_Inv" column; this is +1 if competitor 1 does not have availability in the hotel. Or, 0 if both Expedia and competitor 1 have availability. Null signifies there is no competitive data. + [LoadColumn(28)] + public float Comp1_Inv { get; set; } + + // Maps to "Comp1_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 1's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(29)] + public float Comp1_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp2_Rate" column; this is +1 if Expedia has a lower price than competitor 2 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 2. Null signifies there is no competitive data. + [LoadColumn(30)] + public float Comp2_Rate { get; set; } + + // Maps to the "Comp2_Inv" column; this is +1 if competitor 2 does not have availability in the hotel. Or, 0 if both Expedia and competitor 2 have availability. Null signifies there is no competitive data. + [LoadColumn(31)] + public float Comp2_Inv { get; set; } + + // Maps to "Comp2_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 2's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(32)] + public float Comp2_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp3_Rate" column; this is +1 if Expedia has a lower price than competitor 3 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 3. Null signifies there is no competitive data. + [LoadColumn(33)] + public float Comp3_Rate { get; set; } + + // Maps to the "Comp3_Inv" column; this is +1 if competitor 3 does not have availability in the hotel. Or, 0 if both Expedia and competitor 3 have availability. Null signifies there is no competitive data. + [LoadColumn(34)] + public float Comp3_Inv { get; set; } + + // Maps to "Comp3_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 3's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(35)] + public float Comp3_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp4_Rate" column; this is +1 if Expedia has a lower price than competitor 4 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 4. Null signifies there is no competitive data. + [LoadColumn(36)] + public float Comp4_Rate { get; set; } + + // Maps to the "Comp4_Inv" column; this is +1 if competitor 4 does not have availability in the hotel. Or, 0 if both Expedia and competitor 4 have availability. Null signifies there is no competitive data. + [LoadColumn(37)] + public float Comp4_Inv { get; set; } + + // Maps to "Comp4_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 4's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(38)] + public float Comp4_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp5_Rate" column; this is +1 if Expedia has a lower price than competitor 5 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 5. Null signifies there is no competitive data.
+ [LoadColumn(39)] + public float Comp5_Rate { get; set; } + + // Maps to the "Comp5_Inv" column; this is +1 if competitor 5 does not have availability in the hotel. Or, 0 if both Expedia and competitor 5 have availability. Null signifies there is no competitive data. + [LoadColumn(40)] + public float Comp5_Inv { get; set; } + + // Maps to "Comp5_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 5's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(41)] + public float Comp5_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp6_Rate" column; this is +1 if Expedia has a lower price than competitor 6 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 6. Null signifies there is no competitive data. + [LoadColumn(42)] + public float Comp6_Rate { get; set; } + + // Maps to the "Comp6_Inv" column; this is +1 if competitor 6 does not have availability in the hotel. Or, 0 if both Expedia and competitor 6 have availability. Null signifies there is no competitive data. + [LoadColumn(43)] + public float Comp6_Inv { get; set; } + + // Maps to "Comp6_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 6's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(44)] + public float Comp6_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp7_Rate" column; this is +1 if Expedia has a lower price than competitor 7 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 7. Null signifies there is no competitive data. + [LoadColumn(45)] + public float Comp7_Rate { get; set; } + + // Maps to the "Comp7_Inv" column; this is +1 if competitor 7 does not have availability in the hotel. Or, 0 if both Expedia and competitor 7 have availability. Null signifies there is no competitive data. + [LoadColumn(46)] + public float Comp7_Inv { get; set; } + + // Maps to "Comp7_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 7's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(47)] + public float Comp7_Rate_Percent_Diff { get; set; } + + // Maps to the "Comp8_Rate" column; this is +1 if Expedia has a lower price than competitor 8 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 8. Null signifies there is no competitive data. + [LoadColumn(48)] + public float Comp8_Rate { get; set; } + + // Maps to the "Comp8_Inv" column; this is +1 if competitor 8 does not have availability in the hotel. Or, 0 if both Expedia and competitor 8 have availability. Null signifies there is no competitive data. + [LoadColumn(49)] + public float Comp8_Inv { get; set; } + + // Maps to "Comp8_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 8's price (Expedia's price the denominator). Null signifies there is no competitive data. + [LoadColumn(50)] + public float Comp8_Rate_Percent_Diff { get; set; } + + // Maps to the "Click_Bool" column; this is +1 if the user clicked through to see more information on this hotel. + [LoadColumn(51)] + public float Srch_Result_Clicked { get; set; } + + // Maps to the "Gross_Bookings_USD" column; this is the total value of the transaction.
This can differ from the price_usd due to taxes, fees, conventions on multiple day booking and purchase of a room type other than the one shown. + [LoadColumn(52)] + public float Gross_Bookings_USD { get; set; } + + // Maps to the "Booking_Bool" column; this is +1 if the user purchases a room at this hotel. + [LoadColumn(53)] + public float Srch_Result_Booked { get; set; } + + // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank (e.g. predicted value) of a hotel search result. + // This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. + [LoadColumn(54)] + public uint Label { get; set; } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs new file mode 100644 index 000000000..5c9ec7ef3 --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs @@ -0,0 +1,22 @@ + +using Microsoft.ML.Data; + +namespace PersonalizedRanking.DataStructures +{ + // Representation of the prediction made by the model (e.g. ranker). + public class HotelPrediction + { + // Maps to the "Srch_Id" column in the underlying Expedia dataset; this is the id of the search\query. + public uint GroupId { get; set; } + + // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank of a hotel search result. This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. + public uint Label { get; set; } + + // Prediction made by the model that indicates the relative ranking of the hotel search result. + [ColumnName("Score")] + public float PredictedRank { get; set; } + + // Values that are influential in determining the relevance of a data instance. This is a vector that contains concatenated columns from the underlying Expedia dataset. + public float[] Features { get; set; } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs new file mode 100644 index 000000000..c085e970b --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs @@ -0,0 +1,10 @@ + +namespace PersonalizedRanking.DataStructures +{ + // Used by ML.NET to do a custom mapping to add the "Label" column to the dataset. + public class HotelRelevance + { + // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank of a hotel search result. This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel.
+ public uint Label { get; set; } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs new file mode 100644 index 000000000..479e2090b --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs @@ -0,0 +1,34 @@ +using Microsoft.ML; +using PersonalizedRanking.DataStructures; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace PersonalizedRanking +{ + public class Mapper + { + // Custom mapper used to label a hotel search result with the ideal rank. + // This is based on guidelines provided by Expedia: https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation. + public static Action GetLabelMapper(MLContext mlContext, IDataView data) + { + Action mapper = (input, output) => + { + if (input.Srch_Result_Booked == 1) + { + output.Label = 2; + } + else if (input.Srch_Result_Clicked == 1) + { + output.Label = 1; + } + else + { + output.Label = 0; + } + }; + + return mapper; + } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj new file mode 100644 index 000000000..90bd2688b --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj @@ -0,0 +1,13 @@ + + + + Exe + netcoreapp2.2 + + + + + + + + diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs new file mode 100644 index 000000000..f29d2dde7 --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -0,0 +1,188 @@ +using Microsoft.ML; +using Microsoft.ML.Trainers.LightGbm; +using PersonalizedRanking.Common; +using PersonalizedRanking.DataStructures; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using static Microsoft.ML.DataOperationsCatalog; + +namespace PersonalizedRanking +{ + class Program + { + const string AssetsPath = @"../../../Assets"; + static string TrainDatasetPath = Path.Combine(AssetsPath, "InputData_Train.csv"); + static string TestDatasetPath = Path.Combine(AssetsPath, "InputData_Test.csv"); + static string ModelPath = Path.Combine(AssetsPath, "RankingModel.csv"); + + static string OriginalDatasetPath = Path.Combine(AssetsPath, "Train.csv"); + static string OriginalExampleDatasetPath = Path.Combine(AssetsPath, "Test.csv"); + + static void Main(string[] args) + { + // Create a common ML.NET context. + // Seed set to any number so you have a deterministic environment for repeatable results.
+ MLContext mlContext = new MLContext(seed: 0); + + try + { + PrepDatasets(mlContext, AssetsPath, OriginalDatasetPath, TrainDatasetPath, TestDatasetPath); + + var model = TrainModel(mlContext, TrainDatasetPath, ModelPath); + + EvaluateModel(mlContext, model, TestDatasetPath); + + ConsumeModel(mlContext, ModelPath, OriginalExampleDatasetPath); + } + catch (Exception e) + { + Console.WriteLine(e.Message); + } + + Console.ReadLine(); + } + + static void PrepDatasets(MLContext mlContext, string assetPath, string originalDatasetPath, string trainDatasetPath, string testDatasetPath) + { + const string DatasetUrl = "https://www.kaggle.com/c/expedia-personalized-sort/download/data.zip"; + + if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) + { + if (!File.Exists(originalDatasetPath)) + { + throw new InvalidOperationException($"This samples requires the Expedia dataset. Please ensure that you have downloaded and extracted the contents of the .zip file to the following directory: {assetPath}. The .zip file can be downloaded from here: {DatasetUrl}"); + } + + Console.WriteLine("===== Prepare the testing/training datasets ====="); + + // Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. + IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); + + Console.WriteLine("===== Label the dataset with ideal ranking value ====="); + + // Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. + IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); + + // To transform the data, call the Fit() method. + ITransformer dataTransformer = dataPipeline.Fit(data); + IDataView labeledData = dataTransformer.Transform(data); + + Console.WriteLine("===== Split the data into testing/training datasets ====="); + + // When splitting the data, 20% is held for the test dataset. + // To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. + // This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). + TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); + IDataView trainData = trainTestData.TrainSet; + IDataView testData = trainTestData.TestSet; + + Console.WriteLine("===== Save the testing/training datasets ====="); + + // Save the test dataset to a file to make it faster to load in subsequent runs. + using (var fileStream = File.Create(trainDatasetPath)) + { + mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); + } + + // Save the train dataset to a file to make it faster to load in subsequent runs. + using (var fileStream = File.Create(testDatasetPath)) + { + mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); + } + } + } + + static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, string modelPath) + { + const string FeaturesVectorName = "Features"; + + Console.WriteLine("===== Load the training dataset ====="); + + // Load the training dataset. 
+ IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: ',', hasHeader: true); + + Console.WriteLine("===== Set up the trainer ====="); + + // Specify the columns to include in the feature input data. + var featureCols = trainData.Schema.AsQueryable() + .Select(s => s.Name) + .Where(c => + c == nameof(HotelData.Price_USD) || + c == nameof(HotelData.Promotion_Flag) || + c == nameof(HotelData.Prop_Id) || + c == nameof(HotelData.Prop_Brand) || + c == nameof(HotelData.Prop_Review_Score)) + .ToArray(); + + // Set trainer options. + LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); + options.CustomGains = new int[] { 0, 1, 5 }; + options.RowGroupColumnName = nameof(HotelData.GroupId); + options.LabelColumnName = nameof(HotelData.Label); + options.FeatureColumnName = FeaturesVectorName; + + // Create an Estimator and transform the data: + // 1. Concatenate the feature columns into a single Features vector. + // 2. Create a key type for the label input data by using the value to key transform. + // 3. Create a key type for the group input data by using a hash transform. TODO: Verify that we can't use a key type mapping here??? + IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) + .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) + .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); + + // Set the LightGbm Lambdarank trainer. + IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); + IEstimator trainerPipeline = dataPipeline.Append(trainer); + + Console.WriteLine("===== Train the model ====="); + + // Training the model is a process of running the chosen algorithm on the given data. To perform training you need to call the Fit() method. + ITransformer model = trainerPipeline.Fit(trainData); + + Console.WriteLine("===== Save the model ====="); + + // Save the model + mlContext.Model.Save(model, trainData.Schema, modelPath); + + return model; + } + + static void EvaluateModel(MLContext mlContext, ITransformer model, string testDatasetPath) + { + Console.WriteLine("===== Evaluate the model's accuracy with test data ====="); + + // Load the test data and use the model to perform predictions on the test data. + IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); + IDataView predictions = model.Transform(testData); + + // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). + ConsoleHelper.EvaluateMetrics(mlContext, predictions); + + // Evaluate metrics for up to 10 search results (e.g. NDCG@10); + ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); + } + + public static void ConsumeModel(MLContext mlContext, string modelPath, string exampleDatasetPath) + { + Console.WriteLine("===== Consume the model ====="); + + DataViewSchema predictionPipelineSchema; + ITransformer predictionPipeline = mlContext.Model.Load(modelPath, out predictionPipelineSchema); + + // Load example data and use the model to perform predictions on it. + IDataView exampleData = mlContext.Data.LoadFromTextFile(exampleDatasetPath, separatorChar: ',', hasHeader: true); + + // Predict rankings. + IDataView predictions = predictionPipeline.Transform(exampleData); + + // In the predictions, get the scores of the hotel search results included in the first query (e.g. group). 
+ IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); + var firstGroupId = hotelQueries.First().GroupId; + IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); + + // The individual scores themselves are NOT a useful measure of accuracy; instead, they are used to determine the ranking where a higher score indicates a higher ranking. + ConsoleHelper.PrintScores(firstGroupPredictions); + } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md new file mode 100644 index 000000000..f9a893e28 --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -0,0 +1,216 @@ +# Rank hotel search results to provide personalized sorting + +| ML.NET version | API type | Status | App Type | Data type | Scenario | ML Task | Algorithms | +|----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------| +| v1.1.0 | Dynamic API | Up-to-date | Console app | .csv file | Ranking hotel search results | Ranking | LightGbm | + +This introductory sample shows how to use ML.NET to predict the relevance and order of hotel search results. In the world of machine learning, this type of prediction is known as ranking. + +## Problem +The ability to perform ranking is a common problem faced by search engines since users expect query results to be ranked\sorted according to their relevance. This problem extends beyond the needs of search engines to include a variety of business scenarios where personalized sorting is key to the user experience. Here are a few specific examples: +* Travel Agency - Provide a list of hotels with those that are most likely to be purchased\booked by the user positioned highest in the list. +* Shopping - Display items from a product catalog in an order that aligns with a user's shopping preferences. +* Recruiting - Retrieve job applications ranked according to the candidates that are most qualified for a new job opening. + +Ranking is useful in any scenario where it is important to list items in an order that increases the likelihood of a click, purchase, reservation, etc. + +In this sample, we show how to apply ranking to the first example listed above to rank hotel search results according to the likelihood that the hotel will be purchased\booked by the user. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting (LightGbm). We use LightGbm's Lambdarank algorithm in this sample to automatically build an ML model to predict ranking. + +## Dataset +The training and testing data used by this sample is based on a public [dataset available at Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) originally provided by Expedia (https://www.expedia.com). + +Expedia's datasets consist of hotel search results that are grouped according to a user's query; each hotel result includes the following details: +* Hotel attributes, such as location attractiveness and price. +* User's criteria for searching hotels, such as the number of rooms\children\adults, length of stay, etc. +* User's purchase and browsing history, such as whether they clicked the link of a hotel or purchased\booked it. +* Information on similar competitor hotel offerings.
+ +## ML Task - Ranking +As previously mentioned, this sample uses the LightGbm Lambdarank algorithm which is applied using a supervised learning technique known as "Learning to Rank". This technique requires that train/test datasets contain groups of data instances that are labeled with their ideal ranking value. The label is a numerical\ordinal value, such as {4, 3, 2, 1, 0} or a text value {"Perfect", "Excellent", "Good", "Fair", or "Bad"}. The process for labeling these data instances with their ideal ranking value can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter approach. + +Once the train/test datasets are labeled with ideal ranking values, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. + +## Solution +The sample performs the following high-level steps to rank Expedia hotel search results: +1. Each hotel search result is **labeled** with its ideal ranking value. +2. Once the dataset is labeled, the data is **split** into training and testing datasets. +3. The model is **trained** using the train dataset using LightGbm Lambdarank algorithm. +4. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each hotel instance. The score is used to determine the ranking relative to other hotels within the same query (e.g. group). The predictions are then **evaluated** by examining metrics; specifically the [Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain). +5. The final step is to **consume** the model to perform ranking predictions for new incoming hotel searches. + +### 1. Label Data +To label the data with ideal ranking values, the sample follows [Expedia's evaluation guidance](https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation): + +* 0 - The user neither clicked on this hotel nor purchased\booked a room at this hotel. +* 1 - The user clicked through to see more information on this hotel. +* 2 - The user purchased\booked a room at this hotel. + +Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new "Label" column that contains values {0, 1, 2} for each hotel search result. + +The code for labeling the data is similar to the following: + +```CSharp +// Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. +IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); + +// Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. +IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); + +// To transform the data, call the Fit() method. 
+ITransformer dataTransformer = dataPipeline.Fit(data); +IDataView labeledData = dataTransformer.Transform(data); + +[...] + +// Custom mapper used to label a hotel search result with the ideal rank. +public static Action GetLabelMapper(MLContext mlContext, IDataView data) +{ + Action mapper = (input, output) => + { + if (input.Srch_Result_Booked == 1) + { + output.Label = 2; + } + else if (input.Srch_Result_Clicked == 1) + { + output.Label = 1; + } + else + { + output.Label = 0; + } + }; + + return mapper; +} +````` +### 2. Split Data + With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure that the hotel search results for a given query aren't split across the two datasets. This would cause label leakage where the same query in our training dataset also exists within the testing dataset. + + Refer to the following code which shows how to split the data: + + ```CSharp +// When splitting the data, 20% is held for the test dataset. +// To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. +// This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). +TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); +IDataView trainData = trainTestData.TrainSet; +IDataView testData = trainTestData.TestSet; + +// Save the test dataset to a file to make it faster to load in subsequent runs. +using (var fileStream = File.Create(trainDatasetPath)) +{ + mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); +} + +// Save the train dataset to a file to make it faster to load in subsequent runs. +using (var fileStream = File.Create(testDatasetPath)) +{ + mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); +} +````` + +### 3. Train Model +This sample trains the model using the LightGbmRankingTrainer which relies on the LightGbm Lambdarank algorithm. The model requires the following inputs: + +* Group Id - Data instances are contained in logical groupings and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the Group Id corresponds to the query or search id. The input group data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). +* Label: Ideal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Features: Values that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). + +When the trainer is set, custom gains are used to apply weights to each of the rank values used in labels. As described earlier in the sample, the labels contain values {0, 1, 2} which directly correlate to the specified gains {0, 1, 5}. This helps to ensure that the model will place more emphasis on ranking hotel search results labeled with 2 (e.g. 
signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. + +The following code is used to train the model: + +```CSharp +const string FeaturesVectorName = "Features"; + +// Load the training dataset. +IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: ',', hasHeader: true); + +// Specify the columns to include in the feature input data. +var featureCols = trainData.Schema.AsQueryable() + .Select(s => s.Name) + .Where(c => + c == nameof(HotelData.Price_USD) || + c == nameof(HotelData.Promotion_Flag) || + c == nameof(HotelData.Prop_Id) || + c == nameof(HotelData.Prop_Brand) || + c == nameof(HotelData.Prop_Review_Score)) + .ToArray(); + +// Set trainer options. +LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); +options.CustomGains = new int[] { 0, 1, 5 }; +options.RowGroupColumnName = nameof(HotelData.GroupId); +options.LabelColumnName = nameof(HotelData.Label); +options.FeatureColumnName = FeaturesVectorName; + +// Create an Estimator and transform the data: +// 1. Concatenate the feature columns into a single Features vector. +// 2. Create a key type for the label input data by using the value to key transform. +// 3. Create a key type for the group input data by using a hash transform. +IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) + .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) + .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); + +// Set the LightGbm Lambdarank trainer. +IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); +IEstimator trainerPipeline = dataPipeline.Append(trainer); + +// Training the model is a process of running the chosen algorithm on the given data. To perform training you need to call the Fit() method. +ITransformer model = trainerPipeline.Fit(trainData); + +// Save the model + mlContext.Model.Save(model, trainData.Schema, modelPath); +````` + +### 4. Test and Evaluate Model +We need this step to conclude how accurate our model is. To do so, the model from the previous step is run against another dataset that was not used in training (e.g. the test dataset). + +`Evaluate()` compares the predicted values for the test dataset and produces various metrics, such as accuracy, you can explore. Specifically, we can gauge the accuracy of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. + +When evaluating the `RankingMetrics` for this sample's model, you'll notice that the following metrics are reported for DCG and NDCG: +* DCG - @1:1.0191, @2:1.5128, @3:1.8371, @4:2.0922, @5:2.2982, @6:2.4641, @7:2.6051, @8:2.7240, @9:2.8234, @10:2.9133 + +* NDCG - @1:0.1184, @2:0.1719, @3:0.2082, @4:0.2372, @5:0.2608, @6:0.2798, @7:0.2960, @8:0.3096, @9:0.3210, @10:0.3314 + +The NDCG values are most useful to examine since this allows us to compare accuracy across different queries. The potential value of NDCG ranges from **0.0** to **1.0**, with 1.0 being a perfect model that exactly matches the ideal ranking. + +With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **.3314**. This is the average NDCG for a query returning the top **10** hotel search results. 
While **.3314** may seem low compared to **1.0**, a more realistic goal is to reach **.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's accuracy, you would need to continue experimenting with feature engineering improvements. + +Refer to the following code used to test and evaluate the model: + +```CSharp +// Load the test data and use the model to perform predictions on the test data. +IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); +IDataView predictions = model.Transform(testData); + +// Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). +ConsoleHelper.EvaluateMetrics(mlContext, predictions); + +// Evaluate metrics for up to 10 search results (e.g. NDCG@10); +ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); +````` + +### 5. Consume Model + +After the model is built and trained, we can use the `Predict()` API to predict the ranking of hotel search results for a user query. + +```CSharp +DataViewSchema predictionPipelineSchema; +ITransformer predictionPipeline = mlContext.Model.Load(modelPath, out predictionPipelineSchema); + +// Load example data and use the model to perform predictions on it. +IDataView exampleData = mlContext.Data.LoadFromTextFile(exampleDatasetPath, separatorChar: ',', hasHeader: true); + +// Predict rankings. +IDataView predictions = predictionPipeline.Transform(exampleData); + +// In the predictions, get the scores of the hotel search results included in the first query (e.g. group). +IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); +var firstGroupId = hotelQueries.First().GroupId; +IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); + +// The individual scores themselves are NOT a useful measure of accuracy; insteady, they are used to determine the ranking where a higher score indicates a higher ranking. 
+ConsoleHelper.PrintScores(firstGroupPredictions); +````` \ No newline at end of file diff --git a/samples/csharp/v1.0.0-All-Samples.sln b/samples/csharp/v1.0.0-All-Samples.sln index 7d466c25c..4c034e397 100644 --- a/samples/csharp/v1.0.0-All-Samples.sln +++ b/samples/csharp/v1.0.0-All-Samples.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.28803.452 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 MinimumVisualStudioVersion = 10.0.40219.1 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BikeSharingDemand.Solution", "BikeSharingDemand.Solution", "{820E8AF2-A47D-4AB8-A4AF-5CDFF97EBCDF}" EndProject @@ -129,6 +129,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TFImageClassififcationE2E.S EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TensorFlowImageClassification", "end-to-end-apps\DeepLearning_ImageClassification_TensorFlow\TensorFlowImageClassification\TensorFlowImageClassification.csproj", "{C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PersonalizedRanking.Solution", "PersonalizedRanking.Solution", "{B76DD928-A78E-497C-BA7D-83C5864452F9}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PersonalizedRanking", "getting-started\Ranking_PersonalizedSort\PersonalizedRanking.csproj", "{C5886C5F-539A-4B9D-A03A-9C5B57E77763}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -425,6 +429,14 @@ Global {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|Any CPU.Build.0 = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.ActiveCfg = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.Build.0 = Release|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|x64.ActiveCfg = Debug|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|x64.Build.0 = Debug|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|Any CPU.Build.0 = Release|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|x64.ActiveCfg = Release|Any CPU + {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -466,6 +478,7 @@ Global {EA9E37C6-8C62-4370-A9CF-369D002B89B6} = {7C3A7DA5-CBEB-420F-B7AC-CDE34BE2D52E} {F2C0FCE9-9F76-4318-826E-892441E4A169} = {EF9F8695-25DE-4FE4-894A-6DE24E0BDD73} {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47} = {F59681C2-D829-4538-A41A-568F7A7D07FD} + {C5886C5F-539A-4B9D-A03A-9C5B57E77763} = {B76DD928-A78E-497C-BA7D-83C5864452F9} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {98369941-33DD-450C-A410-B9A91C8CDE91} From 6cbfc2764e7835f3e5e74e1d2b104f295977c83e Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Mon, 24 Jun 2019 20:12:31 -0500 Subject: [PATCH 02/12] removed todo --- .../Ranking_PersonalizedSort/PersonalizedRanking/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index f29d2dde7..1af57544e 100644 --- 
a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -126,7 +126,7 @@ static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, str // Create an Estimator and transform the data: // 1. Concatenate the feature columns into a single Features vector. // 2. Create a key type for the label input data by using the value to key transform. - // 3. Create a key type for the group input data by using a hash transform. TODO: Verify that we can't use a key type mapping here??? + // 3. Create a key type for the group input data by using a hash transform. IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); From 9ff9c37f1bfca03df35cee56e7380fff597ad88b Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Tue, 25 Jun 2019 09:53:44 -0500 Subject: [PATCH 03/12] Fixed wording in ReadMe --- .../PersonalizedRanking/Program.cs | 75 ++++++++++--------- .../Ranking_PersonalizedSort/README.md | 20 ++--- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index 1af57544e..eeb37b5de 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Net; using static Microsoft.ML.DataOperationsCatalog; namespace PersonalizedRanking @@ -13,12 +14,12 @@ namespace PersonalizedRanking class Program { const string AssetsPath = @"../../../Assets"; - static string TrainDatasetPath = Path.Combine(AssetsPath, "InputData_Train.csv"); - static string TestDatasetPath = Path.Combine(AssetsPath, "InputData_Test.csv"); - static string ModelPath = Path.Combine(AssetsPath, "RankingModel.csv"); + readonly static string TrainDatasetPath = Path.Combine(AssetsPath, "InputData_Train.csv"); + readonly static string TestDatasetPath = Path.Combine(AssetsPath, "InputData_Test.csv"); + readonly static string ModelPath = Path.Combine(AssetsPath, "RankingModel.csv"); - static string OriginalDatasetPath = Path.Combine(AssetsPath, "Train.csv"); - static string OriginalExampleDatasetPath = Path.Combine(AssetsPath, "Test.csv"); + readonly static string OriginalDatasetPath = Path.Combine(AssetsPath, "Train.csv"); + readonly static string OriginalExampleDatasetPath = Path.Combine(AssetsPath, "Test.csv"); static void Main(string[] args) { @@ -48,50 +49,50 @@ static void PrepDatasets(MLContext mlContext, string assetPath, string originalD { const string DatasetUrl = "https://www.kaggle.com/c/expedia-personalized-sort/download/data.zip"; - if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) - { - if (!File.Exists(originalDatasetPath)) + if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) { - throw new InvalidOperationException($"This samples requires the Expedia dataset. Please ensure that you have downloaded and extracted the contents of the .zip file to the following directory: {assetPath}. 
The .zip file can be downloaded from here: {DatasetUrl}"); - } + if (!File.Exists(originalDatasetPath)) + { + throw new InvalidOperationException($"This samples requires the Expedia dataset. Please ensure that you have downloaded and extracted the contents of the .zip file to the following directory: {assetPath}. The .zip file can be downloaded from here: {DatasetUrl}"); + } - Console.WriteLine("===== Prepare the testing/training datasets ====="); + Console.WriteLine("===== Prepare the testing/training datasets ====="); - // Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. - IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); + // Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. + IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); - Console.WriteLine("===== Label the dataset with ideal ranking value ====="); + Console.WriteLine("===== Label the dataset with ideal ranking value ====="); - // Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. - IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); + // Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. + IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); - // To transform the data, call the Fit() method. - ITransformer dataTransformer = dataPipeline.Fit(data); - IDataView labeledData = dataTransformer.Transform(data); + // To transform the data, call the Fit() method. + ITransformer dataTransformer = dataPipeline.Fit(data); + IDataView labeledData = dataTransformer.Transform(data); - Console.WriteLine("===== Split the data into testing/training datasets ====="); + Console.WriteLine("===== Split the data into testing/training datasets ====="); - // When splitting the data, 20% is held for the test dataset. - // To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. - // This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). - TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); - IDataView trainData = trainTestData.TrainSet; - IDataView testData = trainTestData.TestSet; + // When splitting the data, 20% is held for the test dataset. + // To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. + // This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). + TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); + IDataView trainData = trainTestData.TrainSet; + IDataView testData = trainTestData.TestSet; - Console.WriteLine("===== Save the testing/training datasets ====="); + Console.WriteLine("===== Save the testing/training datasets ====="); - // Save the test dataset to a file to make it faster to load in subsequent runs. 
- using (var fileStream = File.Create(trainDatasetPath)) - { - mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); - } + // Save the test dataset to a file to make it faster to load in subsequent runs. + using (var fileStream = File.Create(trainDatasetPath)) + { + mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); + } - // Save the train dataset to a file to make it faster to load in subsequent runs. - using (var fileStream = File.Create(testDatasetPath)) - { - mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); + // Save the train dataset to a file to make it faster to load in subsequent runs. + using (var fileStream = File.Create(testDatasetPath)) + { + mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); + } } - } } static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, string modelPath) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index f9a893e28..3f75e1b6b 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -45,7 +45,7 @@ To label the data with ideal ranking values, the sample follows [Expedia's evalu * 1 - The user clicked through to see more information on this hotel. * 2 - The user purchased\booked a room at this hotel. -Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new "Label" column that contains values {0, 1, 2} for each hotel search result. +Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result. The code for labeling the data is similar to the following: @@ -111,13 +111,13 @@ using (var fileStream = File.Create(testDatasetPath)) ````` ### 3. Train Model -This sample trains the model using the LightGbmRankingTrainer which relies on the LightGbm Lambdarank algorithm. The model requires the following inputs: +This sample trains the model using the LightGbmRankingTrainer which relies on the LightGbm Lambdarank algorithm. The model requires the following input columns: -* Group Id - Data instances are contained in logical groupings and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the Group Id corresponds to the query or search id. The input group data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). -* Label: Ideal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). -* Features: Values that are influential in determining the relevance\rank of a data instance. 
The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the group id corresponds to the query or search id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). +* Label: Column that contains the deal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Features: The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). -When the trainer is set, custom gains are used to apply weights to each of the rank values used in labels. As described earlier in the sample, the labels contain values {0, 1, 2} which directly correlate to the specified gains {0, 1, 5}. This helps to ensure that the model will place more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. +When the trainer is set, **custom gains** are used to apply weights to each of the labeled rank values. As described earlier in the sample, the potential label rank values are {0, 1, 2} which directly correlates to the specified gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. The following code is used to train the model: @@ -148,7 +148,7 @@ options.FeatureColumnName = FeaturesVectorName; // Create an Estimator and transform the data: // 1. Concatenate the feature columns into a single Features vector. // 2. Create a key type for the label input data by using the value to key transform. -// 3. Create a key type for the group input data by using a hash transform. +// 3. Create a key type for the group input data by using a hash transform. TODO: Verify that we can't use a key type mapping here??? IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); @@ -169,14 +169,14 @@ We need this step to conclude how accurate our model is. To do so, the model fro `Evaluate()` compares the predicted values for the test dataset and produces various metrics, such as accuracy, you can explore. Specifically, we can gauge the accuracy of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. 
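
For reference, those values can be read straight off the returned `RankingMetrics` object. The fragment below is a minimal sketch in the style of the sample's other snippets — it assumes the `mlContext`, `model` and `testData` variables from the surrounding code, and (as noted above) by default only the first three truncation levels are populated:

```CSharp
// Score the held-out data and compute the ranking metrics.
IDataView predictions = model.Transform(testData);
RankingMetrics metrics = mlContext.Ranking.Evaluate(predictions);

// Each list holds one value per truncation level; for example, index 2 corresponds to DCG@3 / NDCG@3.
var dcg = metrics.DiscountedCumulativeGains;
var ndcg = metrics.NormalizedDiscountedCumulativeGains;

Console.WriteLine($"NDCG@{ndcg.Count}: {ndcg[ndcg.Count - 1]:F4}");
`````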
-When evaluating the `RankingMetrics` for this sample's model, you'll notice that the following metrics are reported for DCG and NDCG: +When evaluating the `RankingMetrics` for this sample's model, you'll notice that the following metrics are reported for DCG and NDCG (the values that you see when running the sample will be similar to these): * DCG - @1:1.0191, @2:1.5128, @3:1.8371, @4:2.0922, @5:2.2982, @6:2.4641, @7:2.6051, @8:2.7240, @9:2.8234, @10:2.9133 * NDCG - @1:0.1184, @2:0.1719, @3:0.2082, @4:0.2372, @5:0.2608, @6:0.2798, @7:0.2960, @8:0.3096, @9:0.3210, @10:0.3314 The NDCG values are most useful to examine since this allows us to compare accuracy across different queries. The potential value of NDCG ranges from **0.0** to **1.0**, with 1.0 being a perfect model that exactly matches the ideal ranking. -With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **.3314**. This is the average NDCG for a query returning the top **10** hotel search results. While **.3314** may seem low compared to **1.0**, a more realistic goal is to reach **.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's accuracy, you would need to continue experimenting with feature engineering improvements. +With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **.3314**. This is the average NDCG for a query returning the top **10** hotel search results. While **.3314** may seem low compared to **1.0**, a more realistic goal is to reach **.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's accuracy, we would need to experiment with feature engineering to continue to improve our model. Refer to the following code used to test and evaluate the model: @@ -213,4 +213,4 @@ IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where // The individual scores themselves are NOT a useful measure of accuracy; insteady, they are used to determine the ranking where a higher score indicates a higher ranking. ConsoleHelper.PrintScores(firstGroupPredictions); -````` \ No newline at end of file +````` From 996621a6f4b2bf1fd423f1d41e04feb11f201dea Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Tue, 25 Jun 2019 09:58:33 -0500 Subject: [PATCH 04/12] Fixed typos --- .../PersonalizedRanking/Program.cs | 4 ++-- .../getting-started/Ranking_PersonalizedSort/README.md | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index eeb37b5de..92aee1edc 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -160,7 +160,7 @@ static void EvaluateModel(MLContext mlContext, ITransformer model, string testDa // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). ConsoleHelper.EvaluateMetrics(mlContext, predictions); - // Evaluate metrics for up to 10 search results (e.g. 
NDCG@10); + // Evaluate metrics for up to 10 search results (e.g. NDCG@10). ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); } @@ -182,7 +182,7 @@ public static void ConsumeModel(MLContext mlContext, string modelPath, string ex var firstGroupId = hotelQueries.First().GroupId; IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); - // The individual scores themselves are NOT a useful measure of accuracy; insteady, they are used to determine the ranking where a higher score indicates a higher ranking. + // The individual scores themselves are NOT a useful measure of accuracy; instead, they are used to determine the ranking where a higher score indicates a higher ranking. ConsoleHelper.PrintScores(firstGroupPredictions); } } diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index 3f75e1b6b..4d5a81ae1 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -114,8 +114,8 @@ using (var fileStream = File.Create(testDatasetPath)) This sample trains the model using the LightGbmRankingTrainer which relies on the LightGbm Lambdarank algorithm. The model requires the following input columns: * Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the group id corresponds to the query or search id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). -* Label: Column that contains the deal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). -* Features: The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Label - Column that contains the deal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Features - The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). When the trainer is set, **custom gains** are used to apply weights to each of the labeled rank values. As described earlier in the sample, the potential label rank values are {0, 1, 2} which directly correlates to the specified gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. 
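
To make the effect of those gains concrete, here is a small illustration rather than code taken from the sample. It assumes it runs inside a method with `System.Linq` in scope, and it uses a common DCG-style formulation (gain divided by a log2 position discount), which may differ in detail from the trainer's internal computation:

```CSharp
// Illustrative only: how the gains {0, 1, 5} reward placing a booked hotel (label 2) near the top.
int[] gains = { 0, 1, 5 };                     // gain assigned to labels 0, 1 and 2

// A common DCG-style score: sum of gain(label at position i) / log2(i + 2), with 0-based positions.
// (Select and Sum require System.Linq.)
double Dcg(int[] rankedLabels) =>
    rankedLabels.Select((label, i) => gains[label] / Math.Log(i + 2, 2)).Sum();

Console.WriteLine(Dcg(new[] { 2, 1, 0, 0 }));  // booked hotel ranked first -> ~5.63
Console.WriteLine(Dcg(new[] { 0, 1, 0, 2 }));  // booked hotel ranked last  -> ~2.78
`````

Moving the booked hotel from the bottom of this four-item group to the top roughly doubles the score, while reordering the lower-gain results barely changes it — which is the emphasis the custom gains are intended to create.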
@@ -148,7 +148,7 @@ options.FeatureColumnName = FeaturesVectorName; // Create an Estimator and transform the data: // 1. Concatenate the feature columns into a single Features vector. // 2. Create a key type for the label input data by using the value to key transform. -// 3. Create a key type for the group input data by using a hash transform. TODO: Verify that we can't use a key type mapping here??? +// 3. Create a key type for the group input data by using a hash transform. IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); @@ -188,7 +188,7 @@ IDataView predictions = model.Transform(testData); // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). ConsoleHelper.EvaluateMetrics(mlContext, predictions); -// Evaluate metrics for up to 10 search results (e.g. NDCG@10); +// Evaluate metrics for up to 10 search results (e.g. NDCG@10). ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); ````` @@ -211,6 +211,6 @@ IEnumerable hotelQueries = mlContext.Data.CreateEnumerable().GroupId; IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); -// The individual scores themselves are NOT a useful measure of accuracy; insteady, they are used to determine the ranking where a higher score indicates a higher ranking. +// The individual scores themselves are NOT a useful measure of accuracy; instead, they are used to determine the ranking where a higher score indicates a higher ranking. ConsoleHelper.PrintScores(firstGroupPredictions); ````` From abe226fc1f4ac80ff2bdf84545c94aea2d3c7e56 Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Tue, 25 Jun 2019 10:13:27 -0500 Subject: [PATCH 05/12] Modified RankingMetric code --- .../getting-started/Ranking_PersonalizedSort/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index 4d5a81ae1..dd5a0f8f7 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -185,11 +185,10 @@ Refer to the following code used to test and evaluate the model: IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); IDataView predictions = model.Transform(testData); -// Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). -ConsoleHelper.EvaluateMetrics(mlContext, predictions); +[...] -// Evaluate metrics for up to 10 search results (e.g. NDCG@10). -ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); +// Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). +RankingMetrics metrics = mlContext.Ranking.Evaluate(scoredData); ````` ### 5. 
Consume Model From bffd76cd275cc6e98df0a5c92da1ff1eafa08d01 Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Wed, 26 Jun 2019 13:55:00 -0500 Subject: [PATCH 06/12] Incorporated Justin's feedback --- .../Common/ConsoleHelper.cs | 2 +- .../DataStructures/HotelPrediction.cs | 5 +- .../PersonalizedRanking/Program.cs | 15 +++--- .../Ranking_PersonalizedSort/README.md | 51 ++++++++++--------- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs index 812ce1184..150c2ddfb 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs @@ -55,7 +55,7 @@ public static void PrintScores(IEnumerable predictions) { foreach (var prediction in predictions) { - Console.WriteLine($"GroupId: {prediction.GroupId}, Score: {prediction.PredictedRank}"); + Console.WriteLine($"GroupId: {prediction.GroupId}, Score: {prediction.Score}"); } } } diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs index 5c9ec7ef3..d1759867f 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs @@ -12,9 +12,8 @@ public class HotelPrediction // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank of a hotel search result. This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. public uint Label { get; set; } - // Prediction made by the model that indicates the relative ranking of the hotel search result. - [ColumnName("Score")] - public float PredictedRank { get; set; } + // Prediction made by the model that is used to indicate the relative ranking of the hotel search result. + public float Score { get; set; } // Values that are influential in determining the relevance of a data instance. This is a vector that contains concatenated columns from the underlying Expedia dataset. public float[] Features { get; set; } diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index 92aee1edc..9b0e89e4e 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -73,9 +73,9 @@ static void PrepDatasets(MLContext mlContext, string assetPath, string originalD Console.WriteLine("===== Split the data into testing/training datasets ====="); // When splitting the data, 20% is held for the test dataset. - // To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. + // To avoid data leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. 
// This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). - TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); + TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: 0.2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); IDataView trainData = trainTestData.TrainSet; IDataView testData = trainTestData.TestSet; @@ -117,7 +117,7 @@ static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, str c == nameof(HotelData.Prop_Review_Score)) .ToArray(); - // Set trainer options. + // Set trainer options. LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); options.CustomGains = new int[] { 0, 1, 5 }; options.RowGroupColumnName = nameof(HotelData.GroupId); @@ -132,7 +132,7 @@ static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, str .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); - // Set the LightGbm Lambdarank trainer. + // Set the LightGBM LambdaRank trainer. IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); IEstimator trainerPipeline = dataPipeline.Append(trainer); @@ -151,7 +151,7 @@ static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, str static void EvaluateModel(MLContext mlContext, ITransformer model, string testDatasetPath) { - Console.WriteLine("===== Evaluate the model's accuracy with test data ====="); + Console.WriteLine("===== Evaluate the model's result quality with test data ====="); // Load the test data and use the model to perform predictions on the test data. IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); @@ -180,9 +180,10 @@ public static void ConsumeModel(MLContext mlContext, string modelPath, string ex // In the predictions, get the scores of the hotel search results included in the first query (e.g. group). IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); var firstGroupId = hotelQueries.First().GroupId; - IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); + IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); - // The individual scores themselves are NOT a useful measure of accuracy; instead, they are used to determine the ranking where a higher score indicates a higher ranking. + // The individual scores themselves are NOT a useful measure of result quality; instead, they are only useful as a relative measure to other scores in the group. + // The scores are used to determine the ranking where a higher score indicates a higher ranking versus another candidate result. 
ConsoleHelper.PrintScores(firstGroupPredictions); } } diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index dd5a0f8f7..753bdf060 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -4,7 +4,7 @@ |----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------| | v1.1.0 | Dynamic API | Up-to-date | Console app | .csv file | Ranking hotel search results | Ranking | LightGbm | -This introductory sample shows how to use ML.NET to predict the relevance and order of hotel search results. In the world of machine learning, this type of prediction is known as ranking. +This introductory sample shows how to use ML.NET to predict the the best order to display hotel search result. In the world of machine learning, this type of prediction is known as ranking. ## Problem The ability to perform ranking is a common problem faced by search engines since users expect query results to be ranked\sorted according to their relevance. This problem extends beyond the needs of search engines to include a variety of business scenarios where personalized sorting is key to the user experience. Here are a few specific examples: @@ -14,7 +14,7 @@ The ability to perform ranking is a common problem faced by search engines since Ranking is useful to any scenario where it is important to list items in an order that increases the likelihood of a click, purchase, reservation, etc. -In this sample, we show how to apply ranking to the first example listed above to rank hotel search results according to the likelihood that the hotel will be purchased\booked by the user. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting (LightGbm). We use the LightGbm's Lambdarank algorithm in this sample to automatically build an ML model to predict ranking. +In this sample, we show how to apply ranking to the first example listed above to rank hotel search results according to the likelihood that the hotel will be purchased\booked by the user. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting Machine (LightGBM). We use the LightGBM's LambdaRank implementation in this sample to automatically build an ML model to predict ranking. ## Dataset The training and testing data used by this sample is based on a public [dataset available at Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) originally provided by Expedia (https://www.expedia.com). @@ -26,26 +26,29 @@ Expedia's datasets consist of hotel search results that are grouped according to * Information on similar competitor hotel offerings. ## ML Task - Ranking -As previously mentioned, this sample uses the LightGbm Lambdarank algorithm which is applied using a supervised learning technique known as "Learning to Rank". This technique requires that train/test datasets contain groups of data instances that are labeled with their ideal ranking value. The label is a numerical\ordinal value, such as {4, 3, 2, 1, 0} or a text value {"Perfect", "Excellent", "Good", "Fair", or "Bad"}. The process for labeling these data instances with their ideal ranking value can be done manually by subject matter experts. 
Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter approach. +As previously mentioned, this sample uses the LightGBM LambdaRank algorithm which is applied using a supervised learning technique known as [**Learning to Rank**](https://en.wikipedia.org/wiki/Learning_to_rank). This technique requires that train/test datasets contain groups of data instances that are each labeled with their relevance scores. The label is a numerical\ordinal value, such as {0, 1, 2, 3, 4} or a text value {"Bad", "Fair", "Good", Excellent", or "Perfect"}. The process for labeling these data instances with their relevance scores can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter +approach. -Once the train/test datasets are labeled with ideal ranking values, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. +It is expected that the dataset will have many more "Bad" relevance scores than "Perfect". This helps to avoid converting a ranked list directly into equally sized bins of {0, 1, 2, 3, 4}. The relevance scores are also reused so that you will have many items **per group** that are labeled 0, which means the result is "Bad". And, only one or a few labeled 4, which means that the result is "Perfect". + +Once the train/test datasets are labeled with relevance scores, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. ## Solution The sample performs the following high-level steps to rank Expedia hotel search results: -1. Each hotel search result is **labeled** with its ideal ranking value. +1. Each hotel search result is **labeled** with its relevance score. 2. Once the dataset is labeled, the data is **split** into training and testing datasets. -3. The model is **trained** using the train dataset using LightGbm Lambdarank algorithm. -4. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each hotel instance. The score is used to determine the ranking relative to other hotels within the same query (e.g. group). The predictions are then **evaluated** by examining metrics; specifically the [Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain). +3. The model is **trained** using the train dataset using LightGBM LambdaRank. +4. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each hotel instance. 
The score is used to determine the ranking relative to other hotels within the same query (e.g. group). The predictions are then **evaluated** by examining metrics; specifically the [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain)(NDCG). 5. The final step is to **consume** the model to perform ranking predictions for new incoming hotel searches. ### 1. Label Data -To label the data with ideal ranking values, the sample follows [Expedia's evaluation guidance](https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation): +To label the data with relevance scores, the sample follows [Expedia's evaluation guidance](https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation): * 0 - The user neither clicked on this hotel nor purchased\booked a room at this hotel. * 1 - The user clicked through to see more information on this hotel. * 2 - The user purchased\booked a room at this hotel. -Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result. +Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result which maps to the relevance gains {0, 1, 5}. The code for labeling the data is similar to the following: @@ -62,7 +65,7 @@ IDataView labeledData = dataTransformer.Transform(data); [...] -// Custom mapper used to label a hotel search result with the ideal rank. +// Custom mapper used to label a hotel search result with the relevance score. public static Action GetLabelMapper(MLContext mlContext, IDataView data) { Action mapper = (input, output) => @@ -85,15 +88,15 @@ public static Action GetLabelMapper(MLContext mlConte } ````` ### 2. Split Data - With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure that the hotel search results for a given query aren't split across the two datasets. This would cause label leakage where the same query in our training dataset also exists within the testing dataset. + With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure all of the results for a single hotel search remain in the same dataset split. Otherwise, this would cause data leakage where the same query in our training dataset also exists within the testing dataset. The samplingKeyColumnName parameter of TrainTestSplit is used to ensure proper splitting. Refer to the following code which shows how to split the data: ```CSharp // When splitting the data, 20% is held for the test dataset. -// To avoid label leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. +// To avoid data leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. // This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). 
-TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: .2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); +TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: 0.2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); IDataView trainData = trainTestData.TrainSet; IDataView testData = trainTestData.TestSet; @@ -111,10 +114,10 @@ using (var fileStream = File.Create(testDatasetPath)) ````` ### 3. Train Model -This sample trains the model using the LightGbmRankingTrainer which relies on the LightGbm Lambdarank algorithm. The model requires the following input columns: +This sample trains the model using the LightGbmRankingTrainer which relies on LightGBM LambdaRank. The model requires the following input columns: -* Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the group id corresponds to the query or search id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). -* Label - Column that contains the deal rank (e.g. degree of relevance) of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). +* Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings representing all candidate results in a single query and each group has an identifier known as the group id. In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the group id corresponds to the query or search id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). +* Label - Column that contains the relevance label of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). * Features - The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). When the trainer is set, **custom gains** are used to apply weights to each of the labeled rank values. As described earlier in the sample, the potential label rank values are {0, 1, 2} which directly correlates to the specified gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. @@ -138,7 +141,7 @@ var featureCols = trainData.Schema.AsQueryable() c == nameof(HotelData.Prop_Review_Score)) .ToArray(); -// Set trainer options. +// Set trainer options. 
LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); options.CustomGains = new int[] { 0, 1, 5 }; options.RowGroupColumnName = nameof(HotelData.GroupId); @@ -153,7 +156,7 @@ IEstimator dataPipeline = mlContext.Transforms.Concatenate(Feature .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); -// Set the LightGbm Lambdarank trainer. +// Set the LightGBM LambdaRank trainer. IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); IEstimator trainerPipeline = dataPipeline.Append(trainer); @@ -167,16 +170,16 @@ ITransformer model = trainerPipeline.Fit(trainData); ### 4. Test and Evaluate Model We need this step to conclude how accurate our model is. To do so, the model from the previous step is run against another dataset that was not used in training (e.g. the test dataset). -`Evaluate()` compares the predicted values for the test dataset and produces various metrics, such as accuracy, you can explore. Specifically, we can gauge the accuracy of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. +`Evaluate()` compares the predicted values for the test dataset and produces various metrics you can explore. Specifically, we can gauge the quality of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. When evaluating the `RankingMetrics` for this sample's model, you'll notice that the following metrics are reported for DCG and NDCG (the values that you see when running the sample will be similar to these): * DCG - @1:1.0191, @2:1.5128, @3:1.8371, @4:2.0922, @5:2.2982, @6:2.4641, @7:2.6051, @8:2.7240, @9:2.8234, @10:2.9133 * NDCG - @1:0.1184, @2:0.1719, @3:0.2082, @4:0.2372, @5:0.2608, @6:0.2798, @7:0.2960, @8:0.3096, @9:0.3210, @10:0.3314 -The NDCG values are most useful to examine since this allows us to compare accuracy across different queries. The potential value of NDCG ranges from **0.0** to **1.0**, with 1.0 being a perfect model that exactly matches the ideal ranking. +The NDCG values are most useful to examine since this allows us to compare our model's ranking ability across different datasets. The potential value of NDCG ranges from **0.0** to **1.0**, with 1.0 being a perfect model that exactly matches the ideal ranking. -With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **.3314**. This is the average NDCG for a query returning the top **10** hotel search results. While **.3314** may seem low compared to **1.0**, a more realistic goal is to reach **.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's accuracy, we would need to experiment with feature engineering to continue to improve our model. +With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **0.3314**. This is the average NDCG for a query returning the top **10** hotel search results. 
While **0.3314** may seem low compared to **1.0**, a more realistic goal is to reach **0.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's ranking ability, we would need to experiment with feature engineering and model hyperparameters to continue to improve our model. You can refer to the [winning solutions on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/overview/winners) for ideas on how to do this. Refer to the following code used to test and evaluate the model: @@ -191,7 +194,7 @@ IDataView predictions = model.Transform(testData); RankingMetrics metrics = mlContext.Ranking.Evaluate(scoredData); ````` -### 5. Consume Model +### 5. Consume Model After the model is built and trained, we can use the `Predict()` API to predict the ranking of hotel search results for a user query. @@ -208,8 +211,8 @@ IDataView predictions = predictionPipeline.Transform(exampleData); // In the predictions, get the scores of the hotel search results included in the first query (e.g. group). IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); var firstGroupId = hotelQueries.First().GroupId; -IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.PredictedRank).ToList(); +IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); -// The individual scores themselves are NOT a useful measure of accuracy; instead, they are used to determine the ranking where a higher score indicates a higher ranking. +// The individual scores themselves are NOT a useful measure of result quality; instead, they are only useful as a relative measure to other scores in the group. The scores are used to determine the ranking where a higher score indicates a higher ranking versus another candidate result. ConsoleHelper.PrintScores(firstGroupPredictions); ````` From e1533366fe304f45f12658834080dfdc7ed6ace7 Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Wed, 26 Jun 2019 14:16:28 -0500 Subject: [PATCH 07/12] Fixed minor inconsistencies --- .../Ranking_PersonalizedSort/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index 753bdf060..0538c78b4 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -4,7 +4,7 @@ |----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------| | v1.1.0 | Dynamic API | Up-to-date | Console app | .csv file | Ranking hotel search results | Ranking | LightGbm | -This introductory sample shows how to use ML.NET to predict the the best order to display hotel search result. In the world of machine learning, this type of prediction is known as ranking. +This introductory sample shows how to use ML.NET to predict the the best order to display hotel search results. In the world of machine learning, this type of prediction is known as ranking. 
## Problem The ability to perform ranking is a common problem faced by search engines since users expect query results to be ranked\sorted according to their relevance. This problem extends beyond the needs of search engines to include a variety of business scenarios where personalized sorting is key to the user experience. Here are a few specific examples: @@ -26,18 +26,18 @@ Expedia's datasets consist of hotel search results that are grouped according to * Information on similar competitor hotel offerings. ## ML Task - Ranking -As previously mentioned, this sample uses the LightGBM LambdaRank algorithm which is applied using a supervised learning technique known as [**Learning to Rank**](https://en.wikipedia.org/wiki/Learning_to_rank). This technique requires that train/test datasets contain groups of data instances that are each labeled with their relevance scores. The label is a numerical\ordinal value, such as {0, 1, 2, 3, 4} or a text value {"Bad", "Fair", "Good", Excellent", or "Perfect"}. The process for labeling these data instances with their relevance scores can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter +As previously mentioned, this sample uses the LightGBM LambdaRank algorithm which is applied using a supervised learning technique known as [**Learning to Rank**](https://en.wikipedia.org/wiki/Learning_to_rank). This technique requires that train\test datasets contain groups of data instances that are each labeled with their relevance score. The label is a numerical\ordinal value, such as {0, 1, 2, 3, 4} or a text value {"Bad", "Fair", "Good", Excellent", or "Perfect"}. The process for labeling these data instances with their relevance scores can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter approach. It is expected that the dataset will have many more "Bad" relevance scores than "Perfect". This helps to avoid converting a ranked list directly into equally sized bins of {0, 1, 2, 3, 4}. The relevance scores are also reused so that you will have many items **per group** that are labeled 0, which means the result is "Bad". And, only one or a few labeled 4, which means that the result is "Perfect". -Once the train/test datasets are labeled with relevance scores, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. +Once the train\test datasets are labeled with relevance scores, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. 
The higher the score a data instance has, the more relevant and more highly ranked it is within its group. ## Solution The sample performs the following high-level steps to rank Expedia hotel search results: 1. Each hotel search result is **labeled** with its relevance score. 2. Once the dataset is labeled, the data is **split** into training and testing datasets. -3. The model is **trained** using the train dataset using LightGBM LambdaRank. +3. The model is **trained** using the train dataset with LightGBM LambdaRank. 4. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each hotel instance. The score is used to determine the ranking relative to other hotels within the same query (e.g. group). The predictions are then **evaluated** by examining metrics; specifically the [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain)(NDCG). 5. The final step is to **consume** the model to perform ranking predictions for new incoming hotel searches. @@ -48,7 +48,7 @@ To label the data with relevance scores, the sample follows [Expedia's evaluatio * 1 - The user clicked through to see more information on this hotel. * 2 - The user purchased\booked a room at this hotel. -Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased/booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result which maps to the relevance gains {0, 1, 5}. +Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased\booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result which maps to the relevance gains {0, 1, 5}. You can find more information on how the relevance gains are used when we train the model later in this sample. The code for labeling the data is similar to the following: @@ -88,7 +88,7 @@ public static Action GetLabelMapper(MLContext mlConte } ````` ### 2. Split Data - With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure all of the results for a single hotel search remain in the same dataset split. Otherwise, this would cause data leakage where the same query in our training dataset also exists within the testing dataset. The samplingKeyColumnName parameter of TrainTestSplit is used to ensure proper splitting. + With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure all of the results for a single hotel search remain in the same dataset split. Otherwise, this would cause data leakage where the same query in our training dataset also exists within the testing dataset. The **samplingKeyColumnName** parameter of **TrainTestSplit** is used to ensure proper splitting. Refer to the following code which shows how to split the data: @@ -120,7 +120,7 @@ This sample trains the model using the LightGbmRankingTrainer which relies on Li * Label - Column that contains the relevance label of each data instance where higher values indicate higher relevance. 
The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). * Features - The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). -When the trainer is set, **custom gains** are used to apply weights to each of the labeled rank values. As described earlier in the sample, the potential label rank values are {0, 1, 2} which directly correlates to the specified gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. +When the trainer is set, **custom gains** (or relevance gains) are used to apply weights to each of the labeled relevance scores. As described earlier in the sample, the potential relevance scores are {0, 1, 2} which directly correlates to relevance gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. The following code is used to train the model: @@ -168,7 +168,7 @@ ITransformer model = trainerPipeline.Fit(trainData); ````` ### 4. Test and Evaluate Model -We need this step to conclude how accurate our model is. To do so, the model from the previous step is run against another dataset that was not used in training (e.g. the test dataset). +We need this step to determine how effective our model is at ranking. To do so, the model from the previous step is run against another dataset that was not used in training (e.g. the test dataset). `Evaluate()` compares the predicted values for the test dataset and produces various metrics you can explore. Specifically, we can gauge the quality of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. 
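
To see why NDCG lends itself to comparing different queries, consider a rough, invented example (not taken from the dataset) that reuses the gains {0, 1, 5} and a common log2 position discount: for a four-result query containing one booked hotel, one clicked hotel and two ignored hotels, the ideal ordering scores a DCG of about 5.63, while an ordering that places the booked hotel last scores about 2.78, giving an NDCG of roughly 2.78 / 5.63 ≈ 0.49. Because every query is divided by its own ideal DCG, NDCG always lies between 0.0 and 1.0 regardless of how many results the query returns.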
From addf95af0657a55a9bc47c1f2eb84d20f08733bb Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Fri, 28 Jun 2019 12:21:52 -0500 Subject: [PATCH 08/12] Converted to new dataset --- .../PersonalizedRanking.csproj | 26 - .../Common/ConsoleHelper.cs | 14 +- .../DataStructures/HotelData.cs | 230 -------- .../DataStructures/HotelPrediction.cs | 21 - .../DataStructures/HotelRelevance.cs | 10 - .../DataStructures/SearchResultData.cs | 558 ++++++++++++++++++ .../DataStructures/SearchResultPrediction.cs | 17 + .../PersonalizedRanking/Mapper.cs | 34 -- .../PersonalizedRanking.csproj | 10 +- .../PersonalizedRanking/Program.cs | 136 ++--- .../Ranking_PersonalizedSort/README.md | 176 ++---- 11 files changed, 690 insertions(+), 542 deletions(-) delete mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj delete mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs delete mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs delete mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs create mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs delete mode 100644 samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj deleted file mode 100644 index 5fd7d123e..000000000 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.csproj +++ /dev/null @@ -1,26 +0,0 @@ - - - - Exe - netcoreapp2.2 - - - - - - - - - - - - - - - - - - - - - diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs index 150c2ddfb..d77fe5a15 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs @@ -9,18 +9,18 @@ namespace PersonalizedRanking.Common { public class ConsoleHelper { - // To evaluate the accuracy of the model's predicted rankings, prints out the Discounted Cumulative Gain and Normalized Discounted Cumulative Gain for hotel search queries. - public static void EvaluateMetrics(MLContext mlContext, IDataView scoredData) + // To evaluate the accuracy of the model's predicted rankings, prints out the Discounted Cumulative Gain and Normalized Discounted Cumulative Gain for search queries. + public static void EvaluateMetrics(MLContext mlContext, IDataView predictions) { // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). 
- RankingMetrics metrics = mlContext.Ranking.Evaluate(scoredData); + RankingMetrics metrics = mlContext.Ranking.Evaluate(predictions); Console.WriteLine($"DCG: {string.Join(", ", metrics.DiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); - Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}\n"); } - // Performs evaluation with the truncation level set up to 10 hotel search results within a query. + // Performs evaluation with the truncation level set up to 10 search results within a query. // This is a temporary workaround for this issue: https://github.com/dotnet/machinelearning/issues/2728. public static void EvaluateMetrics(MLContext mlContext, IDataView scoredData, int truncationLevel) { @@ -47,11 +47,11 @@ public static void EvaluateMetrics(MLContext mlContext, IDataView scoredData, in Console.WriteLine($"DCG: {string.Join(", ", metrics.DiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); - Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}"); + Console.WriteLine($"NDCG: {string.Join(", ", metrics.NormalizedDiscountedCumulativeGains.Select((d, i) => $"@{i + 1}:{d:F4}").ToArray())}\n"); } // Prints out the the individual scores used to determine the relative ranking. - public static void PrintScores(IEnumerable predictions) + public static void PrintScores(IEnumerable predictions) { foreach (var prediction in predictions) { diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs deleted file mode 100644 index eedff1aac..000000000 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelData.cs +++ /dev/null @@ -1,230 +0,0 @@ -using Microsoft.ML.Data; -using System; - -namespace PersonalizedRanking.DataStructures -{ - // Representation of the Expedia data set: https://www.kaggle.com/c/expedia-personalized-sort/data. Specifically, this is used for training and testing the model. - public class HotelData - { - // Maps to the "Srch_Id" column; this is the id of the search\query. - [LoadColumn(0)] - public uint GroupId { get; set; } - - // Maps to the "Date_Time" column; this is the date\time of the search. - [LoadColumn(1)] - public DateTime Srch_DateTime { get; set; } - - // Maps to the "Site_Id" column; this is the id of the Expedia point of sale (e.g. Expedia.com, Expedia.co.uk, etc.) - [LoadColumn(2)] - public float Site_Id { get; set; } - - // Maps to the "Visitor_Location_Country_Id" column; this is the id of the country the customer is located. - [LoadColumn(3)] - public float Visitor_Location_Country_Id { get; set; } - - // Mpas to the "Visitor_Hist_Starrating" column; this is the mean star rating of hotels the customer has previously purchased; null signifies there is no purchase history on the customer. - [LoadColumn(4)] - public float Visitor_Hist_Star_Rating { get; set; } - - // Maps to the "Visitor_Hist_Adr_USD" column; this is the mean price per night (in USD) of the hotesl the customer has previously puchases; null signifies there is no purchase history on the customer. 
- [LoadColumn(5)] - public float Visitor_Hist_Adr_USD { get; set; } - - // Maps to the "Prop_Country_Id" column; this is the id of the country the hotel is located in. - [LoadColumn(6)] - public float Prop_Country_Id { get; set; } - - // Maps to the "Prop_Id" column; this is the id of the hotel. - [LoadColumn(7)] - public float Prop_Id { get; set; } - - // Maps to the Prop_Starrating" column; this is the star rating of the hotel, from 1 to 5 in increments of 1. A 0 indicates the property has no starts, the star rating is not known or cannobe be publicized. - [LoadColumn(8)] - public float Prop_Star_Rating { get; set; } - - // Maps to the "Prop_Review_Score" column; this is the mean customer review score for the hotel on a scale out of 5, rounded to 0.5 increments. A 0 means there have been no reviews, null that the information is not available. - [LoadColumn(9)] - public float Prop_Review_Score { get; set; } - - // Maps to the "Prop_Bran_Bool" column; this has +1 if the hotel is part of a major hotel chain; 0 if it is an independent hotel. - [LoadColumn(10)] - public float Prop_Brand { get; set; } - - // Maps to the "Prop_Location_Score1" column; this is the first score outlining the desirability of a hotel's location. - [LoadColumn(11)] - public float Prop_Loc_Score1 { get; set; } - - // Maps to the "Prop_Location_Score2" column; this is the second score outlining the desirability of a hotel's location. - [LoadColumn(12)] - public float Prop_Loc_Score2 { get; set; } - - // Maps to the "Prop_Log_Historical_Price" column; this is the logarithm of the mean price of the hotel over the last trading period. A 0 will occur if the hotel was not sold in that period. - [LoadColumn(13)] - public float Prop_Log_Historical_Price { get; set; } - - // Maps to the "Position" column; this is the hotel position in Expedia's search results page. - [LoadColumn(14)] - public float Position { get; set; } - - // Maps to the "Price_USD" column; this is the displayed price of the hotel for the given search. Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or the whole stay. - [LoadColumn(15)] - public float Price_USD { get; set; } - - // Maps to the "Promotion_Flag" column; this has +1 if the hotel had a sale price promotion specifically displayed. - [LoadColumn(16)] - public float Promotion_Flag { get; set; } - - // Maps to the "Srch_Destination_Id" column; this is the id of the destination wher the hotel search was performed. - [LoadColumn(17)] - public float Srch_Destination_ID { get; set; } - - // Maps to the "Srch_Length_Of_Stay" column; this is the number of nights stay that was searched. - [LoadColumn(18)] - public float Srch_Length_Of_Stay { get; set; } - - // Maps to the "Srch_Booking_Window" column; this is the number of days in the future the hotel staty started from the search date. - [LoadColumn(19)] - public float Srch_Booking_Window { get; set; } - - // Maps to the "Srch_Adults_Count" column; this is the number of adults specified in the hotel room. - [LoadColumn(20)] - public float Srch_Adults_Count { get; set; } - - // Maps to the "Srch_Children_Count" column; this is the number of (extra occupancy) children specified in the hotel room. - [LoadColumn(21)] - public float Srch_Children_Count { get; set; } - - // Maps to the "Srch_Room_Count" column; this is the number of hotel rooms specified in the search. 
- [LoadColumn(22)] - public float Srch_Room_Count { get; set; } - - // Maps to the "Srch_Saturday_Night_Bool" column; this has +1 if the stay includs a Saturday night, starts from Thursday within a length of stay is less than or equal to 4 nights (e.g. weekend) - otherwise 0. - [LoadColumn(23)] - public float Srch_Saturday_Night { get; set; } - - // Maps to the "Srch_Query_Affility_Score" column; this is the log of the probability a hotel will be clicked on in internet searches (hence the values are negative). Null signifies there is no data (e.g. hotel did not register in any searches). - [LoadColumn(24)] - public float Srch_Query_Affinity_Score { get; set; } - - // Maps to the "Orig_Destination_Distance"; this is the physical distance between the hotel and the customer at the time of the search. A null means the distance could not be calculated. - [LoadColumn(25)] - public float Orig_Destination_Distance { get; set; } - - // Maps to the "Random_Bool" column; this is +1 when the displayed sort was random - 0 when the noraml sort order (determined by Expedia's algorithm) was displayed - [LoadColumn(26)] - public float Random_Position { get; set; } - - // Maps to the "Comp1_Rate" column; this is +1 if Expedia has a lwoer price than competitor 1 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 1. Null signifies there is no competitive data. - [LoadColumn(27)] - public float Comp1_Rate { get; set; } - - // Maps to the "Comp1_Inv" column; this is +1 if competitor 1 does not have availability in the hotel. Or, 0 if both Expedia and competitor 1 have availability. Null signifies there is no competitive data. - [LoadColumn(28)] - public float Comp1_Inv { get; set; } - - // Maps to "Comp1_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 1's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(29)] - public float Comp1_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp2_Rate" column; this is +1 if Expedia has a lwoer price than competitor 2 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 2. Null signifies there is no competitive data. - [LoadColumn(30)] - public float Comp2_Rate { get; set; } - - // Maps to the "Comp2_Inv" column; this is +1 if competitor 2 does not have availability in the hotel. Or, 0 if both Expedia and competitor 2 have availability. Null signifies there is no competitive data. - [LoadColumn(31)] - public float Comp2_Inv { get; set; } - - // Maps to "Comp2_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 2's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(32)] - public float Comp2_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp3_Rate" column; this is +1 if Expedia has a lwoer price than competitor 3 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 3. Null signifies there is no competitive data. - [LoadColumn(33)] - public float Comp3_Rate { get; set; } - - // Maps to the "Comp3_Inv" column; this is +1 if competitor 3 does not have availability in the hotel. Or, 0 if both Expedia and competitor 3 have availability. Null signifies there is no competitive data. 
- [LoadColumn(34)] - public float Comp3_Inv { get; set; } - - // Maps to "Comp3_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 3's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(35)] - public float Comp3_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp4_Rate" column; this is +1 if Expedia has a lwoer price than competitor 4 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 4. Null signifies there is no competitive data. - [LoadColumn(36)] - public float Comp4_Rate { get; set; } - - // Maps to the "Comp4_Inv" column; this is +1 if competitor 4 does not have availability in the hotel. Or, 0 if both Expedia and competitor 4 have availability. Null signifies there is no competitive data. - [LoadColumn(37)] - public float Comp4_Inv { get; set; } - - // Maps to "Comp4_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 4's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(38)] - public float Comp4_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp5_Rate" column; this is +1 if Expedia has a lwoer price than competitor 5 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 5. Null signifies there is no competitive data. - [LoadColumn(39)] - public float Comp5_Rate { get; set; } - - // Maps to the "Comp5_Inv" column; this is +1 if competitor 5 does not have availability in the hotel. Or, 0 if both Expedia and competitor 5 have availability. Null signifies there is no competitive data. - [LoadColumn(40)] - public float Comp5_Inv { get; set; } - - // Maps to "Comp5_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 5's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(41)] - public float Comp5_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp6_Rate" column; this is +1 if Expedia has a lwoer price than competitor 6 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 6. Null signifies there is no competitive data. - [LoadColumn(42)] - public float Comp6_Rate { get; set; } - - // Maps to the "Comp6_Inv" column; this is +1 if competitor 6 does not have availability in the hotel. Or, 0 if both Expedia and competitor 6 have availability. Null signifies there is no competitive data. - [LoadColumn(43)] - public float Comp6_Inv { get; set; } - - // Maps to "Comp6_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 6's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(44)] - public float Comp6_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp7_Rate" column; this is +1 if Expedia has a lwoer price than competitor 7 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 7. Null signifies there is no competitive data. - [LoadColumn(45)] - public float Comp7_Rate { get; set; } - - // Maps to the "Com72_Inv" column; this is +1 if competitor 7 does not have availability in the hotel. Or, 0 if both Expedia and competitor 7 have availability. Null signifies there is no competitive data. 
- [LoadColumn(46)] - public float Comp7_Inv { get; set; } - - // Maps to "Comp7_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 7's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(47)] - public float Comp7_Rate_Percent_Diff { get; set; } - - // Maps to the "Comp8_Rate" column; this is +1 if Expedia has a lwoer price than competitor 8 for the hotel. Or, 0 if the same. Or, -1 if Expedia's price is higher than competitor 8. Null signifies there is no competitive data. - [LoadColumn(48)] - public float Comp8_Rate { get; set; } - - // Maps to the "Comp8_Inv" column; this is +1 if competitor 8 does not have availability in the hotel. Or, 0 if both Expedia and competitor 8 have availability. Null signifies there is no competitive data. - [LoadColumn(49)] - public float Comp8_Inv { get; set; } - - // Maps to "Comp8_Rate_Percent_Diff" column; this is the absolute percentage difference (if one exists) between Expedia and competitor 8's price (Expedia's price the denominator). Null signifies there is no competitive data. - [LoadColumn(50)] - public float Comp8_Rate_Percent_Diff { get; set; } - - // Maps to the "Click_Bool" column; this is +1 if the user clicked through to see more information on this hotel. - [LoadColumn(51)] - public float Srch_Result_Clicked { get; set; } - - // Maps to the "Gross_Booking_USD" column; this it eh total value of the transaction. This can differ from the price_us due to taxes, fees, conventions on multiple day booking and purchase of a room type otehr than the one shown. - [LoadColumn(52)] - public float Gross_Bookings_USD { get; set; } - - // Maps to the "Booking_Bool" column; this is +1 if the user purchases a room at this hotel. - [LoadColumn(53)] - public float Srch_Result_Booked { get; set; } - - // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank (e.g. predicted value) of a hotel search result. - // This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. - [LoadColumn(54)] - public uint Label { get; set; } - } -} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs deleted file mode 100644 index d1759867f..000000000 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelPrediction.cs +++ /dev/null @@ -1,21 +0,0 @@ - -using Microsoft.ML.Data; - -namespace PersonalizedRanking.DataStructures -{ - // Representation of the prediction made by the model (e.g. ranker). - public class HotelPrediction - { - // Maps to the "Srch_Id" column in the underlying Expedia dataset; this is the id of the search\query. - public uint GroupId { get; set; } - - // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank of a hotel search result. This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. 
- public uint Label { get; set; } - - // Prediction made by the model that is used to indicate the relative ranking of the hotel search result. - public float Score { get; set; } - - // Values that are influential in determining the relevance of a data instance. This is a vector that contains concatenated columns from the underlying Expedia dataset. - public float[] Features { get; set; } - } -} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs deleted file mode 100644 index c085e970b..000000000 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/HotelRelevance.cs +++ /dev/null @@ -1,10 +0,0 @@ - -namespace PersonalizedRanking.DataStructures -{ - // Used by ML .NET to do a custom mapping to add the "Label" column to the dataset. - public class HotelRelevance - { - // The "Label" does not exist in the underlying Expedia dataset and is added in the sample to indicate the ideal rank of a hotel search result. This is 2 if the user purchased\booked a room at this hotel. Or, is 1 if the user clicked through to see more information on this hotel. Otherwise, is 0 if the user neither clicked nor purchased\booked a room at this hotel. - public uint Label { get; set; } - } -} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs new file mode 100644 index 000000000..ff99977cc --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs @@ -0,0 +1,558 @@ +using Microsoft.ML.Data; + +namespace PersonalizedRanking.DataStructures +{ + public class SearchResultData + { + [ColumnName("Label"), LoadColumn(0)] + public uint Label { get; set; } + + + [ColumnName("GroupId"), LoadColumn(1)] + public uint GroupId { get; set; } + + + [ColumnName("CoveredQueryTermNumberAnchor"), LoadColumn(2)] + public float CoveredQueryTermNumberAnchor { get; set; } + + + [ColumnName("CoveredQueryTermNumberTitle"), LoadColumn(3)] + public float CoveredQueryTermNumberTitle { get; set; } + + + [ColumnName("CoveredQueryTermNumberUrl"), LoadColumn(4)] + public float CoveredQueryTermNumberUrl { get; set; } + + + [ColumnName("CoveredQueryTermNumberWholeDocument"), LoadColumn(5)] + public float CoveredQueryTermNumberWholeDocument { get; set; } + + + [ColumnName("CoveredQueryTermNumberBody"), LoadColumn(6)] + public float CoveredQueryTermNumberBody { get; set; } + + + [ColumnName("CoveredQueryTermRatioAnchor"), LoadColumn(7)] + public float CoveredQueryTermRatioAnchor { get; set; } + + + [ColumnName("CoveredQueryTermRatioTitle"), LoadColumn(8)] + public float CoveredQueryTermRatioTitle { get; set; } + + + [ColumnName("CoveredQueryTermRatioUrl"), LoadColumn(9)] + public float CoveredQueryTermRatioUrl { get; set; } + + + [ColumnName("CoveredQueryTermRatioWholeDocument"), LoadColumn(10)] + public float CoveredQueryTermRatioWholeDocument { get; set; } + + + [ColumnName("CoveredQueryTermRatioBody"), LoadColumn(11)] + public float CoveredQueryTermRatioBody { get; set; } + + + [ColumnName("StreamLengthAnchor"), LoadColumn(12)] + public float StreamLengthAnchor { get; set; } + + + [ColumnName("StreamLengthTitle"), LoadColumn(13)] + public float StreamLengthTitle { get; set; 
} + + + [ColumnName("StreamLengthUrl"), LoadColumn(14)] + public float StreamLengthUrl { get; set; } + + + [ColumnName("StreamLengthWholeDocument"), LoadColumn(15)] + public float StreamLengthWholeDocument { get; set; } + + + [ColumnName("StreamLengthBody"), LoadColumn(16)] + public float StreamLengthBody { get; set; } + + + [ColumnName("IdfAnchor"), LoadColumn(17)] + public float IdfAnchor { get; set; } + + + [ColumnName("IdfTitle"), LoadColumn(18)] + public float IdfTitle { get; set; } + + + [ColumnName("IdfUrl"), LoadColumn(19)] + public float IdfUrl { get; set; } + + + [ColumnName("IdfWholeDocument"), LoadColumn(20)] + public float IdfWholeDocument { get; set; } + + + [ColumnName("IdfBody"), LoadColumn(21)] + public float IdfBody { get; set; } + + + [ColumnName("SumTfAnchor"), LoadColumn(22)] + public float SumTfAnchor { get; set; } + + + [ColumnName("SumTfTitle"), LoadColumn(23)] + public float SumTfTitle { get; set; } + + + [ColumnName("SumTfUrl"), LoadColumn(24)] + public float SumTfUrl { get; set; } + + + [ColumnName("SumTfWholeDocument"), LoadColumn(25)] + public float SumTfWholeDocument { get; set; } + + + [ColumnName("SumTfBody"), LoadColumn(26)] + public float SumTfBody { get; set; } + + + [ColumnName("MinTfAnchor"), LoadColumn(27)] + public float MinTfAnchor { get; set; } + + + [ColumnName("MinTfTitle"), LoadColumn(28)] + public float MinTfTitle { get; set; } + + + [ColumnName("MinTfUrl"), LoadColumn(29)] + public float MinTfUrl { get; set; } + + + [ColumnName("MinTfWholeDocument"), LoadColumn(30)] + public float MinTfWholeDocument { get; set; } + + + [ColumnName("MinTfBody"), LoadColumn(31)] + public float MinTfBody { get; set; } + + + [ColumnName("MaxTfAnchor"), LoadColumn(32)] + public float MaxTfAnchor { get; set; } + + + [ColumnName("MaxTfTitle"), LoadColumn(33)] + public float MaxTfTitle { get; set; } + + + [ColumnName("MaxTfUrl"), LoadColumn(34)] + public float MaxTfUrl { get; set; } + + + [ColumnName("MaxTfWholeDocument"), LoadColumn(35)] + public float MaxTfWholeDocument { get; set; } + + + [ColumnName("MaxTfBody"), LoadColumn(36)] + public float MaxTfBody { get; set; } + + + [ColumnName("MeanTfAnchor"), LoadColumn(37)] + public float MeanTfAnchor { get; set; } + + + [ColumnName("MeanTfTitle"), LoadColumn(38)] + public float MeanTfTitle { get; set; } + + + [ColumnName("MeanTfUrl"), LoadColumn(39)] + public float MeanTfUrl { get; set; } + + + [ColumnName("MeanTfWholeDocument"), LoadColumn(40)] + public float MeanTfWholeDocument { get; set; } + + + [ColumnName("MeanTfBody"), LoadColumn(41)] + public float MeanTfBody { get; set; } + + + [ColumnName("VarianceTfAnchor"), LoadColumn(42)] + public float VarianceTfAnchor { get; set; } + + + [ColumnName("VarianceTfTitle"), LoadColumn(43)] + public float VarianceTfTitle { get; set; } + + + [ColumnName("VarianceTfUrl"), LoadColumn(44)] + public float VarianceTfUrl { get; set; } + + + [ColumnName("VarianceTfWholeDocument"), LoadColumn(45)] + public float VarianceTfWholeDocument { get; set; } + + + [ColumnName("VarianceTfBody"), LoadColumn(46)] + public float VarianceTfBody { get; set; } + + + [ColumnName("SumStreamLengthNormalizedTfAnchor"), LoadColumn(47)] + public float SumStreamLengthNormalizedTfAnchor { get; set; } + + + [ColumnName("SumStreamLengthNormalizedTfTitle"), LoadColumn(48)] + public float SumStreamLengthNormalizedTfTitle { get; set; } + + + [ColumnName("SumStreamLengthNormalizedTfUrl"), LoadColumn(49)] + public float SumStreamLengthNormalizedTfUrl { get; set; } + + + 
[ColumnName("SumStreamLengthNormalizedTfWholeDocument"), LoadColumn(50)] + public float SumStreamLengthNormalizedTfWholeDocument { get; set; } + + + [ColumnName("SumStreamLengthNormalizedTfBody"), LoadColumn(51)] + public float SumStreamLengthNormalizedTfBody { get; set; } + + + [ColumnName("MinStreamLengthNormalizedTfAnchor"), LoadColumn(52)] + public float MinStreamLengthNormalizedTfAnchor { get; set; } + + + [ColumnName("MinStreamLengthNormalizedTfTitle"), LoadColumn(53)] + public float MinStreamLengthNormalizedTfTitle { get; set; } + + + [ColumnName("MinStreamLengthNormalizedTfUrl"), LoadColumn(54)] + public float MinStreamLengthNormalizedTfUrl { get; set; } + + + [ColumnName("MinStreamLengthNormalizedTfWholeDocument"), LoadColumn(55)] + public float MinStreamLengthNormalizedTfWholeDocument { get; set; } + + + [ColumnName("MinStreamLengthNormalizedTfBody"), LoadColumn(56)] + public float MinStreamLengthNormalizedTfBody { get; set; } + + + [ColumnName("MaxStreamLengthNormalizedTfAnchor"), LoadColumn(57)] + public float MaxStreamLengthNormalizedTfAnchor { get; set; } + + + [ColumnName("MaxStreamLengthNormalizedTfTitle"), LoadColumn(58)] + public float MaxStreamLengthNormalizedTfTitle { get; set; } + + + [ColumnName("MaxStreamLengthNormalizedTfUrl"), LoadColumn(59)] + public float MaxStreamLengthNormalizedTfUrl { get; set; } + + + [ColumnName("MaxStreamLengthNormalizedTfWholeDocument"), LoadColumn(60)] + public float MaxStreamLengthNormalizedTfWholeDocument { get; set; } + + + [ColumnName("MaxStreamLengthNormalizedTfBody"), LoadColumn(61)] + public float MaxStreamLengthNormalizedTfBody { get; set; } + + + [ColumnName("MeanStreamLengthNormalizedTfAnchor"), LoadColumn(62)] + public float MeanStreamLengthNormalizedTfAnchor { get; set; } + + + [ColumnName("MeanStreamLengthNormalizedTfTitle"), LoadColumn(63)] + public float MeanStreamLengthNormalizedTfTitle { get; set; } + + + [ColumnName("MeanStreamLengthNormalizedTfUrl"), LoadColumn(64)] + public float MeanStreamLengthNormalizedTfUrl { get; set; } + + + [ColumnName("MeanStreamLengthNormalizedTfWholeDocument"), LoadColumn(65)] + public float MeanStreamLengthNormalizedTfWholeDocument { get; set; } + + + [ColumnName("MeanStreamLengthNormalizedTfBody"), LoadColumn(66)] + public float MeanStreamLengthNormalizedTfBody { get; set; } + + + [ColumnName("VarianceStreamLengthNormalizedTfAnchor"), LoadColumn(67)] + public float VarianceStreamLengthNormalizedTfAnchor { get; set; } + + + [ColumnName("VarianceStreamLengthNormalizedTfTitle"), LoadColumn(68)] + public float VarianceStreamLengthNormalizedTfTitle { get; set; } + + + [ColumnName("VarianceStreamLengthNormalizedTfUrl"), LoadColumn(69)] + public float VarianceStreamLengthNormalizedTfUrl { get; set; } + + + [ColumnName("VarianceStreamLengthNormalizedTfWholeDocument"), LoadColumn(70)] + public float VarianceStreamLengthNormalizedTfWholeDocument { get; set; } + + + [ColumnName("VarianceStreamLengthNormalizedTfBody"), LoadColumn(71)] + public float VarianceStreamLengthNormalizedTfBody { get; set; } + + + [ColumnName("SumTfidfAnchor"), LoadColumn(72)] + public float SumTfidfAnchor { get; set; } + + + [ColumnName("SumTfidfTitle"), LoadColumn(73)] + public float SumTfidfTitle { get; set; } + + + [ColumnName("SumTfidfUrl"), LoadColumn(74)] + public float SumTfidfUrl { get; set; } + + + [ColumnName("SumTfidfWholeDocument"), LoadColumn(75)] + public float SumTfidfWholeDocument { get; set; } + + + [ColumnName("SumTfidfBody"), LoadColumn(76)] + public float SumTfidfBody { get; set; } + + + 
[ColumnName("MinTfidfAnchor"), LoadColumn(77)] + public float MinTfidfAnchor { get; set; } + + + [ColumnName("MinTfidfTitle"), LoadColumn(78)] + public float MinTfidfTitle { get; set; } + + + [ColumnName("MinTfidfUrl"), LoadColumn(79)] + public float MinTfidfUrl { get; set; } + + + [ColumnName("MinTfidfWholeDocument"), LoadColumn(80)] + public float MinTfidfWholeDocument { get; set; } + + + [ColumnName("MinTfidfBody"), LoadColumn(81)] + public float MinTfidfBody { get; set; } + + + [ColumnName("MaxTfidfAnchor"), LoadColumn(82)] + public float MaxTfidfAnchor { get; set; } + + + [ColumnName("MaxTfidfTitle"), LoadColumn(83)] + public float MaxTfidfTitle { get; set; } + + + [ColumnName("MaxTfidfUrl"), LoadColumn(84)] + public float MaxTfidfUrl { get; set; } + + + [ColumnName("MaxTfidfWholeDocument"), LoadColumn(85)] + public float MaxTfidfWholeDocument { get; set; } + + + [ColumnName("MaxTfidfBody"), LoadColumn(86)] + public float MaxTfidfBody { get; set; } + + + [ColumnName("MeanTfidfAnchor"), LoadColumn(87)] + public float MeanTfidfAnchor { get; set; } + + + [ColumnName("MeanTfidfTitle"), LoadColumn(88)] + public float MeanTfidfTitle { get; set; } + + + [ColumnName("MeanTfidfUrl"), LoadColumn(89)] + public float MeanTfidfUrl { get; set; } + + + [ColumnName("MeanTfidfWholeDocument"), LoadColumn(90)] + public float MeanTfidfWholeDocument { get; set; } + + + [ColumnName("MeanTfidfBody"), LoadColumn(91)] + public float MeanTfidfBody { get; set; } + + + [ColumnName("VarianceTfidfAnchor"), LoadColumn(92)] + public float VarianceTfidfAnchor { get; set; } + + + [ColumnName("VarianceTfidfTitle"), LoadColumn(93)] + public float VarianceTfidfTitle { get; set; } + + + [ColumnName("VarianceTfidfUrl"), LoadColumn(94)] + public float VarianceTfidfUrl { get; set; } + + + [ColumnName("VarianceTfidfWholeDocument"), LoadColumn(95)] + public float VarianceTfidfWholeDocument { get; set; } + + + [ColumnName("VarianceTfidfBody"), LoadColumn(96)] + public float VarianceTfidfBody { get; set; } + + + [ColumnName("BooleanModelAnchor"), LoadColumn(97)] + public float BooleanModelAnchor { get; set; } + + + [ColumnName("BooleanModelTitle"), LoadColumn(98)] + public float BooleanModelTitle { get; set; } + + + [ColumnName("BooleanModelUrl"), LoadColumn(99)] + public float BooleanModelUrl { get; set; } + + + [ColumnName("BooleanModelWholeDocument"), LoadColumn(100)] + public float BooleanModelWholeDocument { get; set; } + + + [ColumnName("BooleanModelBody"), LoadColumn(101)] + public float BooleanModelBody { get; set; } + + + [ColumnName("VectorSpaceModelAnchor"), LoadColumn(102)] + public float VectorSpaceModelAnchor { get; set; } + + + [ColumnName("VectorSpaceModelTitle"), LoadColumn(103)] + public float VectorSpaceModelTitle { get; set; } + + + [ColumnName("VectorSpaceModelUrl"), LoadColumn(104)] + public float VectorSpaceModelUrl { get; set; } + + + [ColumnName("VectorSpaceModelWholeDocument"), LoadColumn(105)] + public float VectorSpaceModelWholeDocument { get; set; } + + + [ColumnName("VectorSpaceModelBody"), LoadColumn(106)] + public float VectorSpaceModelBody { get; set; } + + + [ColumnName("Bm25Anchor"), LoadColumn(107)] + public float Bm25Anchor { get; set; } + + + [ColumnName("Bm25Title"), LoadColumn(108)] + public float Bm25Title { get; set; } + + + [ColumnName("Bm25Url"), LoadColumn(109)] + public float Bm25Url { get; set; } + + + [ColumnName("Bm25WholeDocument"), LoadColumn(110)] + public float Bm25WholeDocument { get; set; } + + + [ColumnName("Bm25Body"), LoadColumn(111)] + public float Bm25Body { get; set; } 
+ + + [ColumnName("LmirAbsAnchor"), LoadColumn(112)] + public float LmirAbsAnchor { get; set; } + + + [ColumnName("LmirAbsTitle"), LoadColumn(113)] + public float LmirAbsTitle { get; set; } + + + [ColumnName("LmirAbsUrl"), LoadColumn(114)] + public float LmirAbsUrl { get; set; } + + + [ColumnName("LmirAbsWholeDocument"), LoadColumn(115)] + public float LmirAbsWholeDocument { get; set; } + + + [ColumnName("LmirAbsBody"), LoadColumn(116)] + public float LmirAbsBody { get; set; } + + + [ColumnName("LmirDirAnchor"), LoadColumn(117)] + public float LmirDirAnchor { get; set; } + + + [ColumnName("LmirDirTitle"), LoadColumn(118)] + public float LmirDirTitle { get; set; } + + + [ColumnName("LmirDirUrl"), LoadColumn(119)] + public float LmirDirUrl { get; set; } + + + [ColumnName("LmirDirWholeDocument"), LoadColumn(120)] + public float LmirDirWholeDocument { get; set; } + + + [ColumnName("LmirDirBody"), LoadColumn(121)] + public float LmirDirBody { get; set; } + + + [ColumnName("LmirJmAnchor"), LoadColumn(122)] + public float LmirJmAnchor { get; set; } + + + [ColumnName("LmirJmTitle"), LoadColumn(123)] + public float LmirJmTitle { get; set; } + + + [ColumnName("LmirJmUrl"), LoadColumn(124)] + public float LmirJmUrl { get; set; } + + + [ColumnName("LmirJmWholeDocument"), LoadColumn(125)] + public float LmirJmWholeDocument { get; set; } + + + [ColumnName("LmirJm"), LoadColumn(126)] + public float LmirJm { get; set; } + + + [ColumnName("NumberSlashInUrl"), LoadColumn(127)] + public float NumberSlashInUrl { get; set; } + + + [ColumnName("LengthUrl"), LoadColumn(128)] + public float LengthUrl { get; set; } + + + [ColumnName("InlinkNumber"), LoadColumn(129)] + public float InlinkNumber { get; set; } + + + [ColumnName("OutlinkNumber"), LoadColumn(130)] + public float OutlinkNumber { get; set; } + + + [ColumnName("PageRank"), LoadColumn(131)] + public float PageRank { get; set; } + + + [ColumnName("SiteRank"), LoadColumn(132)] + public float SiteRank { get; set; } + + + [ColumnName("QualityScore"), LoadColumn(133)] + public float QualityScore { get; set; } + + + [ColumnName("QualityScore2"), LoadColumn(134)] + public float QualityScore2 { get; set; } + + + [ColumnName("QueryUrlClickCount"), LoadColumn(135)] + public float QueryUrlClickCount { get; set; } + + + [ColumnName("UrlClickCount"), LoadColumn(136)] + public float UrlClickCount { get; set; } + + + [ColumnName("UrlDwellTime"), LoadColumn(137)] + public float UrlDwellTime { get; set; } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs new file mode 100644 index 000000000..efeec66d6 --- /dev/null +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs @@ -0,0 +1,17 @@ + +namespace PersonalizedRanking.DataStructures +{ + // Representation of the prediction made by the model (e.g. ranker). + public class SearchResultPrediction + { + public uint GroupId { get; set; } + + public uint Label { get; set; } + + // Prediction made by the model that is used to indicate the relative ranking of the benchmark data instances. + public float Score { get; set; } + + // Values that are influential in determining the relevance of a data instance. This is a vector that contains concatenated columns from the underlying dataset. 
+ public float[] Features { get; set; } + } +} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs deleted file mode 100644 index 479e2090b..000000000 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Mapper.cs +++ /dev/null @@ -1,34 +0,0 @@ -using Microsoft.ML; -using PersonalizedRanking.DataStructures; -using System; -using System.Collections.Generic; -using System.Linq; - -namespace PersonalizedRanking -{ - public class Mapper - { - // Custom mapper used to label a hotel search result with the ideal rank. - // This is based on guidelines provided by Expedia: https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation. - public static Action GetLabelMapper(MLContext mlContext, IDataView data) - { - Action mapper = (input, output) => - { - if (input.Srch_Result_Booked == 1) - { - output.Label = 2; - } - else if (input.Srch_Result_Clicked == 1) - { - output.Label = 1; - } - else - { - output.Label = 0; - } - }; - - return mapper; - } - } -} diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj index 90bd2688b..5a7b44bfe 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj @@ -1,10 +1,18 @@ - + Exe netcoreapp2.2 + + + + + + + + diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index 9b0e89e4e..170b0ffae 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.IO.Compression; using System.Linq; using System.Net; using static Microsoft.ML.DataOperationsCatalog; @@ -14,12 +15,12 @@ namespace PersonalizedRanking class Program { const string AssetsPath = @"../../../Assets"; - readonly static string TrainDatasetPath = Path.Combine(AssetsPath, "InputData_Train.csv"); - readonly static string TestDatasetPath = Path.Combine(AssetsPath, "InputData_Test.csv"); - readonly static string ModelPath = Path.Combine(AssetsPath, "RankingModel.csv"); - - readonly static string OriginalDatasetPath = Path.Combine(AssetsPath, "Train.csv"); - readonly static string OriginalExampleDatasetPath = Path.Combine(AssetsPath, "Test.csv"); + readonly static string InputPath = Path.Combine(AssetsPath, "Input"); + readonly static string OutputPath = Path.Combine(AssetsPath, "Output"); + readonly static string DatasetZipPath = Path.Combine(InputPath, "SearchResultDatasets.zip"); + readonly static string TrainDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTrain720kRows.tsv"); + readonly static string TestDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTest240kRows.tsv"); + readonly static string ModelPath = Path.Combine(OutputPath, "RankingModel.zip"); static void Main(string[] args) { @@ -29,13 +30,13 @@ static void Main(string[] args) try { - PrepDatasets(mlContext, AssetsPath, OriginalDatasetPath, TrainDatasetPath, TestDatasetPath); + 
PrepareData(InputPath, OutputPath, DatasetZipPath, TrainDatasetPath, TestDatasetPath); var model = TrainModel(mlContext, TrainDatasetPath, ModelPath); EvaluateModel(mlContext, model, TestDatasetPath); - ConsumeModel(mlContext, ModelPath, OriginalExampleDatasetPath); + ConsumeModel(mlContext, ModelPath, TestDatasetPath); } catch (Exception e) { @@ -45,142 +46,103 @@ static void Main(string[] args) Console.ReadLine(); } - static void PrepDatasets(MLContext mlContext, string assetPath, string originalDatasetPath, string trainDatasetPath, string testDatasetPath) + static void PrepareData(string inputPath, string outputPath, string datasetZipPath, string trainDatasetPath, string testDatasetPath) { - const string DatasetUrl = "https://www.kaggle.com/c/expedia-personalized-sort/download/data.zip"; - - if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) - { - if (!File.Exists(originalDatasetPath)) - { - throw new InvalidOperationException($"This samples requires the Expedia dataset. Please ensure that you have downloaded and extracted the contents of the .zip file to the following directory: {assetPath}. The .zip file can be downloaded from here: {DatasetUrl}"); - } - - Console.WriteLine("===== Prepare the testing/training datasets ====="); - - // Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. - IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); - - Console.WriteLine("===== Label the dataset with ideal ranking value ====="); - - // Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. - IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); - - // To transform the data, call the Fit() method. - ITransformer dataTransformer = dataPipeline.Fit(data); - IDataView labeledData = dataTransformer.Transform(data); - - Console.WriteLine("===== Split the data into testing/training datasets ====="); - - // When splitting the data, 20% is held for the test dataset. - // To avoid data leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. - // This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). - TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: 0.2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); - IDataView trainData = trainTestData.TrainSet; - IDataView testData = trainTestData.TestSet; - - Console.WriteLine("===== Save the testing/training datasets ====="); + Console.WriteLine("===== Prepare data =====\n"); - // Save the test dataset to a file to make it faster to load in subsequent runs. - using (var fileStream = File.Create(trainDatasetPath)) - { - mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); - } + if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) + { + ZipFile.ExtractToDirectory(datasetZipPath, inputPath); + } - // Save the train dataset to a file to make it faster to load in subsequent runs. 
- using (var fileStream = File.Create(testDatasetPath)) - { - mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); - } - } + if (!Directory.Exists(outputPath)) + { + Directory.CreateDirectory(outputPath); + } } static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, string modelPath) { const string FeaturesVectorName = "Features"; - Console.WriteLine("===== Load the training dataset ====="); + Console.WriteLine("===== Load the training dataset =====\n"); // Load the training dataset. - IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: ',', hasHeader: true); + IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: '\t', hasHeader: true); - Console.WriteLine("===== Set up the trainer ====="); + Console.WriteLine("===== Set up the trainer =====\n"); // Specify the columns to include in the feature input data. var featureCols = trainData.Schema.AsQueryable() .Select(s => s.Name) .Where(c => - c == nameof(HotelData.Price_USD) || - c == nameof(HotelData.Promotion_Flag) || - c == nameof(HotelData.Prop_Id) || - c == nameof(HotelData.Prop_Brand) || - c == nameof(HotelData.Prop_Review_Score)) + c != nameof(SearchResultData.Label) && + c != nameof(SearchResultData.GroupId)) .ToArray(); - // Set trainer options. - LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); - options.CustomGains = new int[] { 0, 1, 5 }; - options.RowGroupColumnName = nameof(HotelData.GroupId); - options.LabelColumnName = nameof(HotelData.Label); - options.FeatureColumnName = FeaturesVectorName; - // Create an Estimator and transform the data: // 1. Concatenate the feature columns into a single Features vector. // 2. Create a key type for the label input data by using the value to key transform. // 3. Create a key type for the group input data by using a hash transform. IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) - .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) - .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); + .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(SearchResultData.Label))) + .Append(mlContext.Transforms.Conversion.Hash(nameof(SearchResultData.GroupId), nameof(SearchResultData.GroupId), numberOfBits: 20)); // Set the LightGBM LambdaRank trainer. - IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); + IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(labelColumnName: nameof(SearchResultData.Label), featureColumnName: FeaturesVectorName, rowGroupColumnName: nameof(SearchResultData.GroupId)); IEstimator trainerPipeline = dataPipeline.Append(trainer); - Console.WriteLine("===== Train the model ====="); + Console.WriteLine("===== Train the model =====\n"); // Training the model is a process of running the chosen algorithm on the given data. To perform training you need to call the Fit() method. 
ITransformer model = trainerPipeline.Fit(trainData); - - Console.WriteLine("===== Save the model ====="); + IDataView transformedTrainData = model.Transform(trainData); +; + Console.WriteLine("===== Save the model =====\n"); // Save the model - mlContext.Model.Save(model, trainData.Schema, modelPath); + mlContext.Model.Save(model, null, modelPath); return model; } static void EvaluateModel(MLContext mlContext, ITransformer model, string testDatasetPath) { - Console.WriteLine("===== Evaluate the model's result quality with test data ====="); + Console.WriteLine("===== Evaluate the model's result quality with test data =====\n"); // Load the test data and use the model to perform predictions on the test data. - IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); + IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: '\t', hasHeader: false); IDataView predictions = model.Transform(testData); + Console.WriteLine("===== Use metrics for the data using NDCG@3 =====\n"); + // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). ConsoleHelper.EvaluateMetrics(mlContext, predictions); + Console.WriteLine("===== Use metrics for the data using NDCG@10 =====\n"); + // Evaluate metrics for up to 10 search results (e.g. NDCG@10). ConsoleHelper.EvaluateMetrics(mlContext, predictions, 10); } - public static void ConsumeModel(MLContext mlContext, string modelPath, string exampleDatasetPath) + public static void ConsumeModel(MLContext mlContext, string modelPath, string testDatasetPath) { - Console.WriteLine("===== Consume the model ====="); + Console.WriteLine("===== Consume the model =====\n"); + // Load test data and use the model to perform predictions on it. + IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: '\t', hasHeader: false); + + // Load the model. DataViewSchema predictionPipelineSchema; ITransformer predictionPipeline = mlContext.Model.Load(modelPath, out predictionPipelineSchema); - // Load example data and use the model to perform predictions on it. - IDataView exampleData = mlContext.Data.LoadFromTextFile(exampleDatasetPath, separatorChar: ',', hasHeader: true); - // Predict rankings. - IDataView predictions = predictionPipeline.Transform(exampleData); + IDataView predictions = predictionPipeline.Transform(testData); - // In the predictions, get the scores of the hotel search results included in the first query (e.g. group). - IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); - var firstGroupId = hotelQueries.First().GroupId; - IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); + // In the predictions, get the scores of the search results included in the first query (e.g. group). + IEnumerable searchQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); + var firstGroupId = searchQueries.First().GroupId; + IEnumerable firstGroupPredictions = searchQueries.Take(100).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); // The individual scores themselves are NOT a useful measure of result quality; instead, they are only useful as a relative measure to other scores in the group. // The scores are used to determine the ranking where a higher score indicates a higher ranking versus another candidate result. 
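As the comments above point out, a score is only meaningful relative to the other scores in the same group. The following is a minimal sketch, assuming a hypothetical scored-result type that mirrors the `GroupId` and `Score` members of `SearchResultPrediction`, of how those relative scores might be turned into 1-based display positions for a single query; the type, values, and helper name are illustrative and not part of the sample.

```CSharp
using System;
using System.Collections.Generic;
using System.Linq;

public static class DisplayOrderSketch
{
    // Hypothetical scored result: only the group and the model's score matter here.
    public sealed class ScoredResult
    {
        public uint GroupId { get; set; }
        public float Score { get; set; }
    }

    // Orders one group's results by descending score and pairs each with a 1-based position.
    // The absolute score values mean nothing on their own; only their order within the group does.
    public static IEnumerable<(int Position, ScoredResult Result)> ToDisplayOrder(IEnumerable<ScoredResult> oneGroup) =>
        oneGroup
            .OrderByDescending(result => result.Score)
            .Select((result, index) => (index + 1, result));

    public static void Main()
    {
        // Made-up scores for a single query group.
        var group = new[]
        {
            new ScoredResult { GroupId = 1, Score = -0.3f },
            new ScoredResult { GroupId = 1, Score = 1.7f },
            new ScoredResult { GroupId = 1, Score = 0.4f }
        };

        foreach (var (position, result) in ToDisplayOrder(group))
        {
            Console.WriteLine($"#{position}: score {result.Score:F2}");
        }
    }
}
```

This is essentially what the `ConsumeModel` step above does when it filters the predictions down to the first `GroupId` and sorts them with `OrderByDescending` before printing the scores.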
diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md index 0538c78b4..94262f705 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md @@ -1,10 +1,10 @@ -# Rank hotel search results to provide personalized sorting +# Rank search engine results | ML.NET version | API type | Status | App Type | Data type | Scenario | ML Task | Algorithms | |----------------|-------------------|-------------------------------|-------------|-----------|---------------------|---------------------------|-----------------------------| -| v1.1.0 | Dynamic API | Up-to-date | Console app | .csv file | Ranking hotel search results | Ranking | LightGbm | +| v1.1.0 | Dynamic API | Up-to-date | Console app | .tsv file | Ranking search engine results | Ranking | LightGBM | -This introductory sample shows how to use ML.NET to predict the the best order to display hotel search results. In the world of machine learning, this type of prediction is known as ranking. +This introductory sample shows how to use ML.NET to predict the best order to display search engine results. In the world of machine learning, this type of prediction is known as ranking. ## Problem The ability to perform ranking is a common problem faced by search engines since users expect query results to be ranked\sorted according to their relevance. This problem extends beyond the needs of search engines to include a variety of business scenarios where personalized sorting is key to the user experience. Here are a few specific examples: @@ -14,113 +14,44 @@ The ability to perform ranking is a common problem faced by search engines since Ranking is useful to any scenario where it is important to list items in an order that increases the likelihood of a click, purchase, reservation, etc. -In this sample, we show how to apply ranking to the first example listed above to rank hotel search results according to the likelihood that the hotel will be purchased\booked by the user. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting Machine (LightGBM). We use the LightGBM's LambdaRank implementation in this sample to automatically build an ML model to predict ranking. +In this sample, we show how to apply ranking to search engine results. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting Machine (LightGBM). We use LightGBM's LambdaRank implementation in this sample to automatically build an ML model to predict ranking. ## Dataset -The training and testing data used by this sample is based on a public [dataset available at Kaggle](https://www.kaggle.com/c/expedia-personalized-sort) originally provided by Expedia (https://www.expedia.com). +The training and testing data used by this sample is based on a public [dataset provided by Microsoft](https://www.microsoft.com/en-us/research/project/mslr/) originally provided by Microsoft Bing. -Expedia's datasets consist of hotel search results that are grouped according to a user's query; each hotel result includes the following details: -* Hotel attributes, such as location attractiveness and price. -* User's criteria for searching hotels, such as the number of rooms\children\adults, length of stay, etc.
-* User's purchase and browsing history, such as whether they clicked the link of a hotel or purchased\booked it. -* Information on similar competitor hotel offerings. +The following description is provided for this dataset: -## ML Task - Ranking -As previously mentioned, this sample uses the LightGBM LambdaRank algorithm which is applied using a supervised learning technique known as [**Learning to Rank**](https://en.wikipedia.org/wiki/Learning_to_rank). This technique requires that train\test datasets contain groups of data instances that are each labeled with their relevance score. The label is a numerical\ordinal value, such as {0, 1, 2, 3, 4} or a text value {"Bad", "Fair", "Good", Excellent", or "Perfect"}. The process for labeling these data instances with their relevance scores can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. This sample uses the latter -approach. - -It is expected that the dataset will have many more "Bad" relevance scores than "Perfect". This helps to avoid converting a ranked list directly into equally sized bins of {0, 1, 2, 3, 4}. The relevance scores are also reused so that you will have many items **per group** that are labeled 0, which means the result is "Bad". And, only one or a few labeled 4, which means that the result is "Perfect". - -Once the train\test datasets are labeled with relevance scores, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. + The datasets are machine learning data, in which queries and urls are represented by IDs. The datasets consist of feature vectors extracted from query-url pairs along with relevance judgment labels: -## Solution -The sample performs the following high-level steps to rank Expedia hotel search results: -1. Each hotel search result is **labeled** with its relevance score. -2. Once the dataset is labeled, the data is **split** into training and testing datasets. -3. The model is **trained** using the train dataset with LightGBM LambdaRank. -4. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each hotel instance. The score is used to determine the ranking relative to other hotels within the same query (e.g. group). The predictions are then **evaluated** by examining metrics; specifically the [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain)(NDCG). -5. The final step is to **consume** the model to perform ranking predictions for new incoming hotel searches. - -### 1. Label Data -To label the data with relevance scores, the sample follows [Expedia's evaluation guidance](https://www.kaggle.com/c/expedia-personalized-sort/overview/evaluation): - -* 0 - The user neither clicked on this hotel nor purchased\booked a room at this hotel. -* 1 - The user clicked through to see more information on this hotel. -* 2 - The user purchased\booked a room at this hotel. 
+ * The relevance judgments are obtained from a retired labeling set of a commercial web search engine (Microsoft Bing), which take 5 values from 0 (irrelevant) to 4 (perfectly relevant). -Expedia's dataset includes both **Click_Bool** and **Booking_Bool** columns that indicate whether the user has clicked or purchased\booked a hotel. Applying the above guidelines to these columns, we create a new **Label** column that contains values {0, 1, 2} for each hotel search result which maps to the relevance gains {0, 1, 5}. You can find more information on how the relevance gains are used when we train the model later in this sample. + * The features are basically extracted by us (e.g. Microsoft), and are those widely used in the research community. -The code for labeling the data is similar to the following: + In the data files, each row corresponds to a query-url pair. The first column is relevance label of the pair, the second column is query id, and the following columns are features. The larger value the relevance label has, the more relevant the query-url pair is. A query-url pair is represented by a 136-dimensional feature vector. -```CSharp -// Load dataset using TextLoader by specifying the type name that holds the data's schema to be mapped with datasets. -IDataView data = mlContext.Data.LoadFromTextFile(originalDatasetPath, separatorChar: ',', hasHeader: true); +## ML Task - Ranking +As previously mentioned, this sample uses the LightGBM LambdaRank algorithm which is applied using a supervised learning technique known as [**Learning to Rank**](https://en.wikipedia.org/wiki/Learning_to_rank). This technique requires that train\test datasets contain groups of data instances that are each labeled with their relevance score (e.g. relevance judgment label). The label is a numerical\ordinal value, such as {0, 1, 2, 3, 4}. The process for labeling these data instances with their relevance scores can be done manually by subject matter experts. Or, the labels can be determined using other metrics, such as the number of clicks on a given search result. -// Create an Estimator and use a custom mapper to transform label hotel instances to values 0, 1, or 2. -IEstimator dataPipeline = mlContext.Transforms.CustomMapping(Mapper.GetLabelMapper(mlContext, data), null); +It is expected that the dataset will have many more "Bad" relevance scores than "Perfect". This helps to avoid converting a ranked list directly into equally sized bins of {0, 1, 2, 3, 4}. The relevance scores are also reused so that you will have many items **per group** that are labeled 0, which means the result is "Bad". And, only one or a few labeled 4, which means that the result is "Perfect". -// To transform the data, call the Fit() method. -ITransformer dataTransformer = dataPipeline.Fit(data); -IDataView labeledData = dataTransformer.Transform(data); +Once the train\test datasets are labeled with relevance scores, the model (e.g. ranker) can then be trained and tested using this data. Through the model training process, the ranker learns how to score each data instance within a group based on their label value. The resulting score of an individual data instance by itself isn't important -- instead, the scores should be compared against one another to determine the relative ordering of a group's data instances. The higher the score a data instance has, the more relevant and more highly ranked it is within its group. -[...] 
+## Solution +Since this sample's dataset is already labeled with relevance scores, we can immediately start training the model. In cases where you start with a dataset that isn't labeled, you will need to go through the labeling process first by having subject matter experts provide relevance scores or by using some other metrics to determine relevance. -// Custom mapper used to label a hotel search result with the relevance score. -public static Action GetLabelMapper(MLContext mlContext, IDataView data) -{ - Action mapper = (input, output) => - { - if (input.Srch_Result_Booked == 1) - { - output.Label = 2; - } - else if (input.Srch_Result_Clicked == 1) - { - output.Label = 1; - } - else - { - output.Label = 0; - } - }; - - return mapper; -} -````` -### 2. Split Data - With the data properly labeled, it is ready to be split into the train/test datasets. When splitting the data, it's important to make sure all of the results for a single hotel search remain in the same dataset split. Otherwise, this would cause data leakage where the same query in our training dataset also exists within the testing dataset. The **samplingKeyColumnName** parameter of **TrainTestSplit** is used to ensure proper splitting. - - Refer to the following code which shows how to split the data: - - ```CSharp -// When splitting the data, 20% is held for the test dataset. -// To avoid data leakage, the GroupId (e.g. search\query id) is specified as the samplingKeyColumnName. -// This ensures that if two or more hotel instances share the same GroupId, that they are guaranteed to appear in the same subset of data (train or test). -TrainTestData trainTestData = mlContext.Data.TrainTestSplit(labeledData, testFraction: 0.2, samplingKeyColumnName: nameof(HotelData.GroupId), seed: 1); -IDataView trainData = trainTestData.TrainSet; -IDataView testData = trainTestData.TestSet; - -// Save the test dataset to a file to make it faster to load in subsequent runs. -using (var fileStream = File.Create(trainDatasetPath)) -{ - mlContext.Data.SaveAsText(trainData, fileStream, separatorChar: ',', headerRow: true, schema: true); -} - -// Save the train dataset to a file to make it faster to load in subsequent runs. -using (var fileStream = File.Create(testDatasetPath)) -{ - mlContext.Data.SaveAsText(testData, fileStream, separatorChar: ',', headerRow: true, schema: true); -} -````` +This sample performs the following high-level steps to rank the search engine results: +1. The model is **trained** using the train dataset with LightGBM LambdaRank. +2. The model is **tested** using the test dataset. This results in a **prediction** that includes a **score** for each search engine result. The score is used to determine the ranking relative to other results within the same query (e.g. group). The predictions are then **evaluated** by examining metrics, specifically the [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain) (NDCG). +3. The final step is to **consume** the model to perform ranking predictions for new incoming searches. -### 3. Train Model +### 1. Train Model This sample trains the model using the LightGbmRankingTrainer which relies on LightGBM LambdaRank. The model requires the following input columns: -* Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings representing all candidate results in a single query and each group has an identifier known as the group id.
In the case of the Expedia dataset, hotel search results are grouped by their corresponding query where the group id corresponds to the query or search id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). +* Group Id - Column that contains the group id for each data instance. Data instances are contained in logical groupings representing all candidate results in a single query and each group has an identifier known as the group id. In the case of the search engine dataset, search results are grouped by their corresponding query where the group id corresponds to the query id. The input group id data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype). * Label - Column that contains the relevance label of each data instance where higher values indicate higher relevance. The input label data type must be [key type](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.data.keydataviewtype) or [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). * Features - The columns that are influential in determining the relevance\rank of a data instance. The input feature data must be a fixed size vector of type [Single](https://docs.microsoft.com/en-us/dotnet/api/system.single). -When the trainer is set, **custom gains** (or relevance gains) are used to apply weights to each of the labeled relevance scores. As described earlier in the sample, the potential relevance scores are {0, 1, 2} which directly correlates to relevance gains {0, 1, 5}. This helps to ensure that the model places more emphasis on ranking hotel search results labeled with 2 (e.g. signifies the user purchased\booked the hotel) so that they are positioned higher when compared to results labeled with 0 or 1. +When the trainer is set, **custom gains** (or relevance gains) can also be used to apply weights to each of the labeled relevance scores. This helps to ensure that the model places more emphasis on positioning results with higher gains closer to the top of the ranking. For the purposes of this sample, we use the trainer's default gains. The following code is used to train the model: @@ -128,91 +59,84 @@ The following code is used to train the model: const string FeaturesVectorName = "Features"; // Load the training dataset. -IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: ',', hasHeader: true); +IDataView trainData = mlContext.Data.LoadFromTextFile(trainDatasetPath, separatorChar: '\t', hasHeader: true); // Specify the columns to include in the feature input data. var featureCols = trainData.Schema.AsQueryable() .Select(s => s.Name) .Where(c => - c == nameof(HotelData.Price_USD) || - c == nameof(HotelData.Promotion_Flag) || - c == nameof(HotelData.Prop_Id) || - c == nameof(HotelData.Prop_Brand) || - c == nameof(HotelData.Prop_Review_Score)) + c != nameof(SearchResultData.Label) && + c != nameof(SearchResultData.GroupId)) .ToArray(); -// Set trainer options. -LightGbmRankingTrainer.Options options = new LightGbmRankingTrainer.Options(); -options.CustomGains = new int[] { 0, 1, 5 }; -options.RowGroupColumnName = nameof(HotelData.GroupId); -options.LabelColumnName = nameof(HotelData.Label); -options.FeatureColumnName = FeaturesVectorName; - // Create an Estimator and transform the data: // 1. Concatenate the feature columns into a single Features vector. // 2. Create a key type for the label input data by using the value to key transform. // 3.
Create a key type for the group input data by using a hash transform. IEstimator dataPipeline = mlContext.Transforms.Concatenate(FeaturesVectorName, featureCols) - .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(HotelData.Label))) - .Append(mlContext.Transforms.Conversion.Hash(nameof(HotelData.GroupId), nameof(HotelData.GroupId), numberOfBits: 20)); + .Append(mlContext.Transforms.Conversion.MapValueToKey(nameof(SearchResultData.Label))) + .Append(mlContext.Transforms.Conversion.Hash(nameof(SearchResultData.GroupId), nameof(SearchResultData.GroupId), numberOfBits: 20)); // Set the LightGBM LambdaRank trainer. -IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(options); +IEstimator trainer = mlContext.Ranking.Trainers.LightGbm(labelColumnName: nameof(SearchResultData.Label), featureColumnName: FeaturesVectorName, rowGroupColumnName: nameof(SearchResultData.GroupId)); ; IEstimator trainerPipeline = dataPipeline.Append(trainer); // Training the model is a process of running the chosen algorithm on the given data. To perform training you need to call the Fit() method. ITransformer model = trainerPipeline.Fit(trainData); +IDataView transformedTrainData = model.Transform(trainData); // Save the model - mlContext.Model.Save(model, trainData.Schema, modelPath); +mlContext.Model.Save(model, null, modelPath); ````` -### 4. Test and Evaluate Model +### 2. Test and Evaluate Model We need this step to determine how effective our model is at ranking. To do so, the model from the previous step is run against another dataset that was not used in training (e.g. the test dataset). `Evaluate()` compares the predicted values for the test dataset and produces various metrics you can explore. Specifically, we can gauge the quality of our model using Discounted Cumulative Gain (DCG) and Normalized Discounted Cumulative Gain (NDCG) which are included in the `RankingMetrics` returned by `Evaluate()`. When evaluating the `RankingMetrics` for this sample's model, you'll notice that the following metrics are reported for DCG and NDCG (the values that you see when running the sample will be similar to these): -* DCG - @1:1.0191, @2:1.5128, @3:1.8371, @4:2.0922, @5:2.2982, @6:2.4641, @7:2.6051, @8:2.7240, @9:2.8234, @10:2.9133 +* DCG - @1:11.9058, @2:17.4132, @3:21.2908, @4:24.5243, @5:27.3235, @6:29.6794, @7:31.9928, @8:34.0955, @9:36.0850, @10:37.9679 -* NDCG - @1:0.1184, @2:0.1719, @3:0.2082, @4:0.2372, @5:0.2608, @6:0.2798, @7:0.2960, @8:0.3096, @9:0.3210, @10:0.3314 +* NDCG - @1:0.5012, @2:0.4945, @3:0.4986, @4:0.5055, @5:0.5131, @6:0.5182, @7:0.5251, @8:0.5308, @9:0.5365, @10:0.5417 The NDCG values are most useful to examine since this allows us to compare our model's ranking ability across different datasets. The potential value of NDCG ranges from **0.0** to **1.0**, with 1.0 being a perfect model that exactly matches the ideal ranking. -With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **0.3314**. This is the average NDCG for a query returning the top **10** hotel search results. While **0.3314** may seem low compared to **1.0**, a more realistic goal is to reach **0.5407** which is the score of the first place winner in [Expedia's Personalize Hotel Search contest on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/leaderboard). To increase the model's ranking ability, we would need to experiment with feature engineering and model hyperparameters to continue to improve our model. 
You can refer to the [winning solutions on Kaggle](https://www.kaggle.com/c/expedia-personalized-sort/overview/winners) for ideas on how to do this. +With this in mind, let's look at our model's values for NDCG. In particular, let's look at the value for **NDCG@10** which is **0.5417**. This is the average NDCG for a query returning the top **10** search engine results. To increase the model's ranking ability, we would need to experiment with feature engineering and model hyperparameters to continue to improve our model. Refer to the following code used to test and evaluate the model: ```CSharp // Load the test data and use the model to perform predictions on the test data. -IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: ',', hasHeader: true); +IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: '\t', hasHeader: false); IDataView predictions = model.Transform(testData); [...] // Evaluate the metrics for the data using NDCG; by default, metrics for the up to 3 search results in the query are reported (e.g. NDCG@3). -RankingMetrics metrics = mlContext.Ranking.Evaluate(scoredData); +RankingMetrics metrics = mlContext.Ranking.Evaluate(predictions); ````` -### 5. Consume Model +### 3. Consume Model -After the model is built and trained, we can use the `Predict()` API to predict the ranking of hotel search results for a user query. +After the model is built and trained, we can use the `Predict()` API to predict the ranking of search engine results for a user query. ```CSharp +// Load test data and use the model to perform predictions on it. +IDataView testData = mlContext.Data.LoadFromTextFile(testDatasetPath, separatorChar: '\t', hasHeader: false); + +// Load the model. DataViewSchema predictionPipelineSchema; ITransformer predictionPipeline = mlContext.Model.Load(modelPath, out predictionPipelineSchema); -// Load example data and use the model to perform predictions on it. -IDataView exampleData = mlContext.Data.LoadFromTextFile(exampleDatasetPath, separatorChar: ',', hasHeader: true); - // Predict rankings. -IDataView predictions = predictionPipeline.Transform(exampleData); +IDataView predictions = predictionPipeline.Transform(testData); -// In the predictions, get the scores of the hotel search results included in the first query (e.g. group). -IEnumerable hotelQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); -var firstGroupId = hotelQueries.First().GroupId; -IEnumerable firstGroupPredictions = hotelQueries.Take(50).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); +// In the predictions, get the scores of the search results included in the first query (e.g. group). +IEnumerable searchQueries = mlContext.Data.CreateEnumerable(predictions, reuseRowObject: false); +var firstGroupId = searchQueries.First().GroupId; +IEnumerable firstGroupPredictions = searchQueries.Take(100).Where(p => p.GroupId == firstGroupId).OrderByDescending(p => p.Score).ToList(); -// The individual scores themselves are NOT a useful measure of result quality; instead, they are only useful as a relative measure to other scores in the group. The scores are used to determine the ranking where a higher score indicates a higher ranking versus another candidate result. +// The individual scores themselves are NOT a useful measure of result quality; instead, they are only useful as a relative measure to other scores in the group. 
+// The scores are used to determine the ranking where a higher score indicates a higher ranking versus another candidate result. ConsoleHelper.PrintScores(firstGroupPredictions); ````` From 2c88f38fc010b3edef5f0f6603d4e0260b6f0843 Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Fri, 28 Jun 2019 13:25:22 -0500 Subject: [PATCH 09/12] Changed code to download dataset since its zip is too large --- .../PersonalizedRanking/Program.cs | 40 ++++++++++++++----- samples/csharp/v1.0.0-All-Samples.sln | 20 +++++----- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index 170b0ffae..a5716a31d 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -1,5 +1,4 @@ using Microsoft.ML; -using Microsoft.ML.Trainers.LightGbm; using PersonalizedRanking.Common; using PersonalizedRanking.DataStructures; using System; @@ -8,16 +7,17 @@ using System.IO.Compression; using System.Linq; using System.Net; -using static Microsoft.ML.DataOperationsCatalog; namespace PersonalizedRanking { class Program { const string AssetsPath = @"../../../Assets"; + const string TrainDatasetUrl = "https://aka.ms/mlnet-resources/benchmarks/MSLRWeb10KTrain720kRows.tsv"; + const string TestDatasetUrl = "https://aka.ms/mlnet-resources/benchmarks/MSLRWeb10KTest240kRows.tsv"; + readonly static string InputPath = Path.Combine(AssetsPath, "Input"); readonly static string OutputPath = Path.Combine(AssetsPath, "Output"); - readonly static string DatasetZipPath = Path.Combine(InputPath, "SearchResultDatasets.zip"); readonly static string TrainDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTrain720kRows.tsv"); readonly static string TestDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTest240kRows.tsv"); readonly static string ModelPath = Path.Combine(OutputPath, "RankingModel.zip"); @@ -30,7 +30,7 @@ static void Main(string[] args) try { - PrepareData(InputPath, OutputPath, DatasetZipPath, TrainDatasetPath, TestDatasetPath); + PrepareData(InputPath, OutputPath, TrainDatasetPath, TrainDatasetUrl, TestDatasetUrl, TestDatasetPath); var model = TrainModel(mlContext, TrainDatasetPath, ModelPath); @@ -46,19 +46,41 @@ static void Main(string[] args) Console.ReadLine(); } - static void PrepareData(string inputPath, string outputPath, string datasetZipPath, string trainDatasetPath, string testDatasetPath) + static void PrepareData(string inputPath, string outputPath, string trainDatasetPath, string trainDatasetUrl, string testDatasetUrl, string testDatasetPath) { Console.WriteLine("===== Prepare data =====\n"); - if (!File.Exists(trainDatasetPath) || !File.Exists(testDatasetPath)) + if (!Directory.Exists(outputPath)) { - ZipFile.ExtractToDirectory(datasetZipPath, inputPath); + Directory.CreateDirectory(outputPath); } - if (!Directory.Exists(outputPath)) + if (!Directory.Exists(inputPath)) { - Directory.CreateDirectory(outputPath); + Directory.CreateDirectory(inputPath); } + + if (!File.Exists(trainDatasetPath)) + { + Console.WriteLine("===== Download the train dataset - this may take several minutes =====\n"); + using (var client = new WebClient()) + { + //To use the datasets, you must read and accept the online agreement. By using the datasets, you agree to be bound by the terms of its license. 
+ client.DownloadFile(trainDatasetUrl, TrainDatasetPath); + } + } + + if (!File.Exists(testDatasetPath)) + { + Console.WriteLine("===== Download the test dataset - this may take several minutes =====\n"); + using (var client = new WebClient()) + { + //To use the datasets, you must read and accept the online agreement. By using the datasets, you agree to be bound by the terms of its license. + client.DownloadFile(testDatasetUrl, testDatasetPath); + } + } + + Console.WriteLine("===== Download is finished =====\n"); } static ITransformer TrainModel(MLContext mlContext, string trainDatasetPath, string modelPath) diff --git a/samples/csharp/v1.0.0-All-Samples.sln b/samples/csharp/v1.0.0-All-Samples.sln index 4c034e397..3488d964c 100644 --- a/samples/csharp/v1.0.0-All-Samples.sln +++ b/samples/csharp/v1.0.0-All-Samples.sln @@ -131,7 +131,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TensorFlowImageClassificati EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PersonalizedRanking.Solution", "PersonalizedRanking.Solution", "{B76DD928-A78E-497C-BA7D-83C5864452F9}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PersonalizedRanking", "getting-started\Ranking_PersonalizedSort\PersonalizedRanking.csproj", "{C5886C5F-539A-4B9D-A03A-9C5B57E77763}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PersonalizedRanking", "getting-started\Ranking_PersonalizedSort\PersonalizedRanking\PersonalizedRanking.csproj", "{97ACFE55-AC1F-41D0-B1A7-A0D97440346A}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -429,14 +429,14 @@ Global {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|Any CPU.Build.0 = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.ActiveCfg = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.Build.0 = Release|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|Any CPU.Build.0 = Debug|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|x64.ActiveCfg = Debug|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Debug|x64.Build.0 = Debug|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|Any CPU.ActiveCfg = Release|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|Any CPU.Build.0 = Release|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|x64.ActiveCfg = Release|Any CPU - {C5886C5F-539A-4B9D-A03A-9C5B57E77763}.Release|x64.Build.0 = Release|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|x64.ActiveCfg = Debug|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|x64.Build.0 = Debug|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|Any CPU.Build.0 = Release|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|x64.ActiveCfg = Release|Any CPU + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -478,7 +478,7 @@ Global {EA9E37C6-8C62-4370-A9CF-369D002B89B6} = {7C3A7DA5-CBEB-420F-B7AC-CDE34BE2D52E} {F2C0FCE9-9F76-4318-826E-892441E4A169} = {EF9F8695-25DE-4FE4-894A-6DE24E0BDD73} {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47} = {F59681C2-D829-4538-A41A-568F7A7D07FD} - {C5886C5F-539A-4B9D-A03A-9C5B57E77763} = 
{B76DD928-A78E-497C-BA7D-83C5864452F9} + {97ACFE55-AC1F-41D0-B1A7-A0D97440346A} = {B76DD928-A78E-497C-BA7D-83C5864452F9} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {98369941-33DD-450C-A410-B9A91C8CDE91} From 71fe0e6954cb0c79a763e53c78f4acb90218205f Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Fri, 28 Jun 2019 13:36:55 -0500 Subject: [PATCH 10/12] fixed using statement --- .../Ranking_PersonalizedSort/PersonalizedRanking/Program.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index a5716a31d..859567b45 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.IO; -using System.IO.Compression; using System.Linq; using System.Net; From 61b694f0f1a3af4bf6162c6511c8cd8eb63d478b Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Fri, 28 Jun 2019 17:48:30 -0500 Subject: [PATCH 11/12] Removed unneeded license info for dataset --- .../Ranking_PersonalizedSort/PersonalizedRanking/Program.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs index 859567b45..aa8ce48bb 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs @@ -64,7 +64,6 @@ static void PrepareData(string inputPath, string outputPath, string trainDataset Console.WriteLine("===== Download the train dataset - this may take several minutes =====\n"); using (var client = new WebClient()) { - //To use the datasets, you must read and accept the online agreement. By using the datasets, you agree to be bound by the terms of its license. client.DownloadFile(trainDatasetUrl, TrainDatasetPath); } } @@ -74,7 +73,6 @@ static void PrepareData(string inputPath, string outputPath, string trainDataset Console.WriteLine("===== Download the test dataset - this may take several minutes =====\n"); using (var client = new WebClient()) { - //To use the datasets, you must read and accept the online agreement. By using the datasets, you agree to be bound by the terms of its license. 
client.DownloadFile(testDatasetUrl, testDatasetPath); } } From 5890089e9ff8f779881b8bd63467a7464c6e7ebb Mon Sep 17 00:00:00 2001 From: Nicole Haugen Date: Fri, 28 Jun 2019 19:22:35 -0500 Subject: [PATCH 12/12] Renamed solution and minor changes --- .../README.md | 15 +++++++++++- .../WebRanking.sln} | 10 ++++---- .../WebRanking}/Common/ConsoleHelper.cs | 4 ++-- .../DataStructures/SearchResultData.cs | 2 +- .../DataStructures/SearchResultPrediction.cs | 4 ++-- .../WebRanking}/Program.cs | 24 ++++++++++++++----- .../WebRanking/WebRanking.csproj} | 8 ------- samples/csharp/v1.0.0-All-Samples.sln | 22 ++++++++--------- 8 files changed, 53 insertions(+), 36 deletions(-) rename samples/csharp/getting-started/{Ranking_PersonalizedSort => Ranking_Web}/README.md (95%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking.sln => Ranking_Web/WebRanking.sln} (62%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking => Ranking_Web/WebRanking}/Common/ConsoleHelper.cs (97%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking => Ranking_Web/WebRanking}/DataStructures/SearchResultData.cs (99%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking => Ranking_Web/WebRanking}/DataStructures/SearchResultPrediction.cs (84%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking => Ranking_Web/WebRanking}/Program.cs (89%) rename samples/csharp/getting-started/{Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj => Ranking_Web/WebRanking/WebRanking.csproj} (55%) diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md b/samples/csharp/getting-started/Ranking_Web/README.md similarity index 95% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/README.md rename to samples/csharp/getting-started/Ranking_Web/README.md index 94262f705..d77128962 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/README.md +++ b/samples/csharp/getting-started/Ranking_Web/README.md @@ -17,7 +17,20 @@ Ranking is useful to any scenario where it is important to list items in an orde In this sample, we show how to apply ranking to search engine results. To perform ranking, there are two algorithms currently available - FastTree Boosting (FastRank) and Light Gradient Boosting Machine (LightGBM). We use the LightGBM's LambdaRank implementation in this sample to automatically build an ML model to predict ranking. ## Dataset -The training and testing data used by this sample is based on a public [dataset provided by Microsoft](https://www.microsoft.com/en-us/research/project/mslr/) originally provided Microsoft Bing. +The training and testing data used by this sample is based on a public [dataset provided by Microsoft](https://www.microsoft.com/en-us/research/project/mslr/) originally provided Microsoft Bing. The dataset is released under a [CC-by 4.0](https://creativecommons.org/licenses/by/4.0/) license. 
+ +@article{DBLP:journals/corr/QinL13, + author = {Tao Qin and + Tie{-}Yan Liu}, + title = {Introducing {LETOR} 4.0 Datasets}, + journal = {CoRR}, + volume = {abs/1306.2597}, + year = {2013}, + url = {https://arxiv.org/abs/1306.2597}, + timestamp = {Mon, 01 Jul 2013 20:31:25 +0200}, + biburl = {https://dblp.uni-trier.de/rec/bib/journals/corr/QinL13}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} The following description is provided for this dataset: diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln b/samples/csharp/getting-started/Ranking_Web/WebRanking.sln similarity index 62% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln rename to samples/csharp/getting-started/Ranking_Web/WebRanking.sln index a63d6b64d..367beec77 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking.sln +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking.sln @@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 15 VisualStudioVersion = 15.0.28307.705 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PersonalizedRanking", "PersonalizedRanking\PersonalizedRanking.csproj", "{F71F24D8-F174-461F-B375-508EFB827A33}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebRanking", "WebRanking\WebRanking.csproj", "{D502394E-930B-401A-812F-2A996751B80A}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -11,10 +11,10 @@ Global Release|Any CPU = Release|Any CPU EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution - {F71F24D8-F174-461F-B375-508EFB827A33}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {F71F24D8-F174-461F-B375-508EFB827A33}.Debug|Any CPU.Build.0 = Debug|Any CPU - {F71F24D8-F174-461F-B375-508EFB827A33}.Release|Any CPU.ActiveCfg = Release|Any CPU - {F71F24D8-F174-461F-B375-508EFB827A33}.Release|Any CPU.Build.0 = Release|Any CPU + {D502394E-930B-401A-812F-2A996751B80A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D502394E-930B-401A-812F-2A996751B80A}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D502394E-930B-401A-812F-2A996751B80A}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D502394E-930B-401A-812F-2A996751B80A}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs b/samples/csharp/getting-started/Ranking_Web/WebRanking/Common/ConsoleHelper.cs similarity index 97% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs rename to samples/csharp/getting-started/Ranking_Web/WebRanking/Common/ConsoleHelper.cs index d77fe5a15..a262e10be 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Common/ConsoleHelper.cs +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking/Common/ConsoleHelper.cs @@ -1,11 +1,11 @@ using Microsoft.ML; using Microsoft.ML.Data; -using PersonalizedRanking.DataStructures; +using WebRanking.DataStructures; using System; using System.Collections.Generic; using System.Linq; -namespace PersonalizedRanking.Common +namespace WebRanking.Common { public class ConsoleHelper { diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs 
b/samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultData.cs similarity index 99% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs rename to samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultData.cs index ff99977cc..177e77a38 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultData.cs +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultData.cs @@ -1,6 +1,6 @@ using Microsoft.ML.Data; -namespace PersonalizedRanking.DataStructures +namespace WebRanking.DataStructures { public class SearchResultData { diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs b/samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultPrediction.cs similarity index 84% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs rename to samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultPrediction.cs index efeec66d6..9b6db3933 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/DataStructures/SearchResultPrediction.cs +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking/DataStructures/SearchResultPrediction.cs @@ -1,5 +1,5 @@  -namespace PersonalizedRanking.DataStructures +namespace WebRanking.DataStructures { // Representation of the prediction made by the model (e.g. ranker). public class SearchResultPrediction @@ -8,7 +8,7 @@ public class SearchResultPrediction public uint Label { get; set; } - // Prediction made by the model that is used to indicate the relative ranking of the benchmark data instances. + // Prediction made by the model that is used to indicate the relative ranking of the candidate search results. public float Score { get; set; } // Values that are influential in determining the relevance of a data instance. This is a vector that contains concatenated columns from the underlying dataset. 
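[Editor's note] The `SearchResultPrediction` type above only exposes a raw `Score`, and as the README stresses, that score is meaningful only relative to the other scores in the same query (group). As a minimal sketch of how a caller might turn a batch of predictions into per-query rankings, assuming the class also surfaces the hashed `GroupId` as a `uint` (the README's consume step reads `p.GroupId`, but the exact type is not shown in this patch), the hypothetical `RankingSketch` helper below is illustrative only and not part of the sample:

```CSharp
using System.Collections.Generic;
using System.Linq;
using WebRanking.DataStructures;

namespace WebRanking.Common
{
    public static class RankingSketch
    {
        // Hypothetical helper (not part of the sample): group the model's predictions by
        // query id and order each group's candidates by descending Score. Only the relative
        // order of scores within a group is meaningful, never the absolute values.
        public static Dictionary<uint, List<SearchResultPrediction>> RankPerQuery(
            IEnumerable<SearchResultPrediction> predictions, int topN = 10)
        {
            return predictions
                .GroupBy(p => p.GroupId)
                .ToDictionary(
                    group => group.Key,
                    group => group.OrderByDescending(p => p.Score)
                                  .Take(topN)
                                  .ToList());
        }
    }
}
```

Usage would mirror the README's consume step, e.g. `RankingSketch.RankPerQuery(searchQueries)[firstGroupId]` to get the ranked top results for the first query.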
diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs b/samples/csharp/getting-started/Ranking_Web/WebRanking/Program.cs similarity index 89% rename from samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs rename to samples/csharp/getting-started/Ranking_Web/WebRanking/Program.cs index aa8ce48bb..1a2eaf1cb 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/Program.cs +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking/Program.cs @@ -1,23 +1,25 @@ using Microsoft.ML; -using PersonalizedRanking.Common; -using PersonalizedRanking.DataStructures; +using WebRanking.Common; +using WebRanking.DataStructures; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Net; -namespace PersonalizedRanking +namespace WebRanking { class Program { const string AssetsPath = @"../../../Assets"; const string TrainDatasetUrl = "https://aka.ms/mlnet-resources/benchmarks/MSLRWeb10KTrain720kRows.tsv"; + const string ValidationDatasetUrl = "https://aka.ms/mlnet-resources/benchmarks/MSLRWeb10KValidate240kRows.tsv"; const string TestDatasetUrl = "https://aka.ms/mlnet-resources/benchmarks/MSLRWeb10KTest240kRows.tsv"; readonly static string InputPath = Path.Combine(AssetsPath, "Input"); readonly static string OutputPath = Path.Combine(AssetsPath, "Output"); readonly static string TrainDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTrain720kRows.tsv"); + readonly static string ValidationDatasetPath = Path.Combine(InputPath, "MSLRWeb10KValidate240kRows.tsv"); readonly static string TestDatasetPath = Path.Combine(InputPath, "MSLRWeb10KTest240kRows.tsv"); readonly static string ModelPath = Path.Combine(OutputPath, "RankingModel.zip"); @@ -29,7 +31,7 @@ static void Main(string[] args) try { - PrepareData(InputPath, OutputPath, TrainDatasetPath, TrainDatasetUrl, TestDatasetUrl, TestDatasetPath); + PrepareData(InputPath, OutputPath, TrainDatasetPath, TrainDatasetUrl, TestDatasetUrl, TestDatasetPath, ValidationDatasetUrl, ValidationDatasetPath); var model = TrainModel(mlContext, TrainDatasetPath, ModelPath); @@ -45,7 +47,8 @@ static void Main(string[] args) Console.ReadLine(); } - static void PrepareData(string inputPath, string outputPath, string trainDatasetPath, string trainDatasetUrl, string testDatasetUrl, string testDatasetPath) + static void PrepareData(string inputPath, string outputPath, string trainDatasetPath, string trainDatasetUrl, + string testDatasetUrl, string testDatasetPath, string validationDatasetUrl, string validationDatasetPath) { Console.WriteLine("===== Prepare data =====\n"); @@ -67,7 +70,16 @@ static void PrepareData(string inputPath, string outputPath, string trainDataset client.DownloadFile(trainDatasetUrl, TrainDatasetPath); } } - + + if (!File.Exists(validationDatasetPath)) + { + Console.WriteLine("===== Download the validation dataset - this may take several minutes =====\n"); + using (var client = new WebClient()) + { + client.DownloadFile(validationDatasetUrl, validationDatasetPath); + } + } + if (!File.Exists(testDatasetPath)) { Console.WriteLine("===== Download the test dataset - this may take several minutes =====\n"); diff --git a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj b/samples/csharp/getting-started/Ranking_Web/WebRanking/WebRanking.csproj similarity index 55% rename from 
samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj rename to samples/csharp/getting-started/Ranking_Web/WebRanking/WebRanking.csproj index 5a7b44bfe..fbca6af6c 100644 --- a/samples/csharp/getting-started/Ranking_PersonalizedSort/PersonalizedRanking/PersonalizedRanking.csproj +++ b/samples/csharp/getting-started/Ranking_Web/WebRanking/WebRanking.csproj @@ -5,14 +5,6 @@ netcoreapp2.2 - - - - - - - - diff --git a/samples/csharp/v1.0.0-All-Samples.sln b/samples/csharp/v1.0.0-All-Samples.sln index 3488d964c..44487c525 100644 --- a/samples/csharp/v1.0.0-All-Samples.sln +++ b/samples/csharp/v1.0.0-All-Samples.sln @@ -129,9 +129,9 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TFImageClassififcationE2E.S EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TensorFlowImageClassification", "end-to-end-apps\DeepLearning_ImageClassification_TensorFlow\TensorFlowImageClassification\TensorFlowImageClassification.csproj", "{C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PersonalizedRanking.Solution", "PersonalizedRanking.Solution", "{B76DD928-A78E-497C-BA7D-83C5864452F9}" +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "WebRanking.Solution", "WebRanking.Solution", "{B76DD928-A78E-497C-BA7D-83C5864452F9}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PersonalizedRanking", "getting-started\Ranking_PersonalizedSort\PersonalizedRanking\PersonalizedRanking.csproj", "{97ACFE55-AC1F-41D0-B1A7-A0D97440346A}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebRanking", "getting-started\Ranking_Web\WebRanking\WebRanking.csproj", "{4EA790BB-76C7-471A-ADE4-6FBD183C461B}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -429,14 +429,14 @@ Global {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|Any CPU.Build.0 = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.ActiveCfg = Release|Any CPU {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47}.Release|x64.Build.0 = Release|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|Any CPU.Build.0 = Debug|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|x64.ActiveCfg = Debug|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Debug|x64.Build.0 = Debug|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|Any CPU.ActiveCfg = Release|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|Any CPU.Build.0 = Release|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|x64.ActiveCfg = Release|Any CPU - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A}.Release|x64.Build.0 = Release|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Debug|Any CPU.Build.0 = Debug|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Debug|x64.ActiveCfg = Debug|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Debug|x64.Build.0 = Debug|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Release|Any CPU.ActiveCfg = Release|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Release|Any CPU.Build.0 = Release|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Release|x64.ActiveCfg = Release|Any CPU + {4EA790BB-76C7-471A-ADE4-6FBD183C461B}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -478,7 +478,7 @@ Global {EA9E37C6-8C62-4370-A9CF-369D002B89B6} = {7C3A7DA5-CBEB-420F-B7AC-CDE34BE2D52E} 
{F2C0FCE9-9F76-4318-826E-892441E4A169} = {EF9F8695-25DE-4FE4-894A-6DE24E0BDD73} {C5D5BEBF-DC10-4065-A27B-AB56E1ABCA47} = {F59681C2-D829-4538-A41A-568F7A7D07FD} - {97ACFE55-AC1F-41D0-B1A7-A0D97440346A} = {B76DD928-A78E-497C-BA7D-83C5864452F9} + {4EA790BB-76C7-471A-ADE4-6FBD183C461B} = {B76DD928-A78E-497C-BA7D-83C5864452F9} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {98369941-33DD-450C-A410-B9A91C8CDE91}
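[Editor's note] The evaluation discussion in this patch series leans on DCG and NDCG but never writes the metrics out. For reference, a common formulation is sketched below, where rel_i is the relevance label of the result at position i within a single query and k is the truncation level (e.g. NDCG@10). The exact gain and discount functions used by ML.NET's `RankingEvaluator` are an assumption here, not something stated in the patches; the reported values are averages of the per-query metrics over the evaluated dataset.

```latex
% Discounted Cumulative Gain at truncation level k for one query,
% using the common exponential-gain form (an assumption, see note above):
\[
\mathrm{DCG@}k = \sum_{i=1}^{k} \frac{2^{\mathrm{rel}_i} - 1}{\log_2(i + 1)}
\]

% NDCG normalizes by the ideal DCG (IDCG), i.e. the DCG of the same results
% re-sorted by their true relevance labels, so the value falls in [0, 1]:
\[
\mathrm{NDCG@}k = \frac{\mathrm{DCG@}k}{\mathrm{IDCG@}k}
\]
```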