diff --git a/examples/simple/simple-backend-tsi.cpp b/examples/simple/simple-backend-tsi.cpp index 254d6862624b7..680d5e4a5359b 100644 --- a/examples/simple/simple-backend-tsi.cpp +++ b/examples/simple/simple-backend-tsi.cpp @@ -39,6 +39,8 @@ float test_input_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { {1.1, -4.4, 10, -5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, -23, 24, 25, -26, 27, -28, 29, -30, 31, -32.6}, //SIN Kernel {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, + //RMS_NORM Kernel + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, //SIGMOID Kernel need to fix not tested {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, //SILU Kernel @@ -64,6 +66,8 @@ float test_input_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, //SIN Kernel input not used {1.1, 2.2, 5, 10, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + //RMS_NORM Kernel input is not used + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, //SIGMOID Kernel not used {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, //SILU Kernel not used @@ -89,11 +93,13 @@ float test_result[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS] = { {1.1, 4.4, 10, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32.6}, //SIN Kernel {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149}, + //RMS_NORM Kernel + {0.052888, 0.105776, 0.158664, 0.211552, 0.264440, 0.317328, 0.370216, 0.423104, 0.475992, 0.528880, 0.581768, 0.634656, 0.687544, 0.740432, 0.793320, 0.846208, 0.899096, 0.951984, 1.004872, 1.057760, 1.110648, 1.163536, 1.216424, 1.269312, 1.322200, 1.375088, 1.427976, 1.480864, 1.533752, 1.586640, 1.639528, 1.692416}, //SIGMOID Kernel not tested {0.891207, -0.951602, -0.544021, -0.958924, -0.958924, -0.279416, 0.656987, 0.989358, 0.412118, -0.544021, -0.999990, -0.536573, 0.420167, 0.990607, 0.650288, -0.287903, -0.961398, -0.750987, 0.149877, 0.912945, 0.912945, 0.912945, -0.846220, -0.905578, -0.132352, 0.762559, 0.956376, 0.270906, -0.663634, -0.988032, -0.404039, 0.926149}, // SILU Kernel {-0.000002, -0.000005, -0.000012, -0.000029, -0.000074, -0.000184, -0.000454, -0.001111, -0.002683, -0.006377, -0.014836, -0.033464, -0.071945, -0.142278, -0.238406, -0.268941, 0.000000, 0.731059, 1.761594, 2.857722, 3.928055, 4.966536, 5.985164, 6.993623, 7.997317, 8.998889, 9.999546, 10.999816, 11.999926, 12.999971, 13.999988, 14.999995} - + }; float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = { @@ -151,6 +157,12 @@ float test_input_scale_1[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //RMS_NORM Kernel + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, //SIGMOID KERNEL need to fix input data {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -217,6 +229,12 @@ float test_input_scale_2[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = -16, 25, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + //RMS_NORM Kernel input not used + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, //SIGMOID KERNEL input not used {-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -9, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -291,6 +309,24 @@ float test_result_scale[GGML_TSAVORITE_KERNEL_TYPE_COUNT][NUM_ELEMENTS_SCALE] = -0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471}, + //RMS_NORM Kernel + { + 0.054620, 0.109240, 0.163860, 0.218479, 0.273099, 0.327719, 0.382339, 0.436959, 0.491579, 0.546199, + 0.600818, 0.655438, 0.710058, 0.764678, 0.819298, 0.873918, 0.928537, 0.983157, 1.037777, 1.092397, + 1.147017, 1.201637, 1.256257, 1.310876, 1.365496, 1.420116, 1.474736, 1.529356, 1.583976, 1.638596, + 1.693215, 1.747835, 0.054620, 0.109240, 0.163860, 0.218479, 0.273099, 0.327719, 0.382339, 0.436959, + 0.491579, 0.546199, 0.600818, 0.655438, 0.710058, 0.764678, 0.819298, 0.873918, 0.928537, 0.983157, + 1.037777, 1.092397, 1.147017, 1.201637, 1.256257, 1.310876, 1.365496, 1.420116, 1.474736, 1.529356, + 1.583976, 1.638596, 1.693215, 1.747835, 0.054620, 0.109240, 0.163860, 0.218479, 0.273099, 0.327719, + 0.382339, 0.436959, 0.491579, 0.546199, 0.600818, 0.655438, 0.710058, 0.764678, 0.819298, 0.873918, + 0.928537, 0.983157, 1.037777, 1.092397, 1.147017, 1.201637, 1.256257, 1.310876, 1.365496, 1.420116, + 1.474736, 1.529356, 1.583976, 1.638596, 1.693215, 1.747835, 0.054620, 0.109240, 0.163860, 0.218479, + 0.273099, 0.327719, 0.382339, 0.436959, 0.491579, 0.546199, 0.600818, 0.655438, 0.710058, 0.764678, + 0.819298, 0.873918, 0.928537, 0.983157, 1.037777, 1.092397, 1.147017, 1.201637, 1.256257, 1.310876, + 1.365496, 1.420116, 1.474736, 1.529356, 1.583976, 1.638596, 1.693215, 1.747835, 0.054620, 0.109240, + 0.163860, 0.218479, 0.273099, 0.327719, 0.382339, 0.436959, 0.491579, 0.546199, 0.600818, 0.655438, + 0.710058, 0.764678, 0.819298, 0.873918, 0.928537, 0.983157, 1.037777, 1.092397, 1.147017, 1.201637, + 1.256257, 1.310876, 1.365496}, // SIGMOID KERNEL, result need to change {-0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, 0.841471, @@ -335,14 +371,15 @@ static void ggml_log_callback_default(ggml_log_level level, const char * text, v } -// --- FLOAT COMPARATOR +// --- FLOAT COMPARATOR static bool ggml_tsi_compare_two_float(float a, float b) { // For very small values, use absolute error if (fabsf(a) < 1e-2f && fabsf(b) < 1e-2f) { return fabsf(a - b) < 1e-6f; // Accept up to 1e-6 difference for small values } - // For larger values, use relative error - const float epsilon = 1e-4f; + // For larger values, use relative error with increased tolerance + // Increased to 1e-3 (0.1%) to handle floating-point precision differences + const float epsilon = 1e-3f; // Changed from 1e-4f to 1e-3f float diff = fabsf(a - b); float max_val = fmaxf(fabsf(a), fabsf(b)); return diff < epsilon * max_val; @@ -376,7 +413,7 @@ static bool load_model(simple_model & model, float * a, float * b, enum ggml_typ /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - fprintf(stderr, "\n Calculating mem_size %ld %d and creating ggml context \n", ggml_tensor_overhead(), num_tensors); + fprintf(stderr, "\n Calculating mem_size %ld %d and creating ggml context \n", ggml_tensor_overhead(), num_tensors); // create context model.ctx = ggml_init(params); @@ -475,6 +512,11 @@ static struct ggml_cgraph * build_graph(const simple_model& model, enum ggml_tsa case GGML_TSAVORITE_KERNEL_TYPE_SIN: result = ggml_sin(ctx0, model.a); break; + case GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM: + printf("\n ANOOP CALLINF RMS_NORM\n"); + //result = ggml_rms_norm(ctx0, model.a, 1e-6f); + result = ggml_rms_norm(ctx0, model.a, 1e-5); + break; case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: result = ggml_sigmoid(ctx0, model.a); break; @@ -500,11 +542,11 @@ static struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t a fprintf(stderr, "\n Under Test case for compute API creating build_graph \n"); struct ggml_cgraph * gf = build_graph(model, ops_type); - if (!gf) { + if (!gf) { fprintf(stderr, "\ncompute failed\n"); return NULL; } - + // allocate tensors ggml_gallocr_alloc_graph(allocr, gf); @@ -533,6 +575,8 @@ enum ggml_tsavorite_kernel_type convert_testcase_to_ops_type (const char *testCa return GGML_TSAVORITE_KERNEL_TYPE_ABS; else if (!strcmp(testCase,"sin")) return GGML_TSAVORITE_KERNEL_TYPE_SIN; + else if (!strcmp(testCase,"rms_norm")) + return GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM; else if (!strcmp(testCase,"sigmoid")) return GGML_TSAVORITE_KERNEL_TYPE_SIGMOID; else if (!strcmp(testCase,"silu")) @@ -561,7 +605,10 @@ const char* convert_ops_type_to_testcase(enum ggml_tsavorite_kernel_type ops_typ return "neg"; case GGML_TSAVORITE_KERNEL_TYPE_ABS: return "abs"; - case GGML_TSAVORITE_KERNEL_TYPE_SIN: + case GGML_TSAVORITE_KERNEL_TYPE_SIN: + return "sin"; + case GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM: + return "rms_norm"; return "sin"; case GGML_TSAVORITE_KERNEL_TYPE_SIGMOID: return "sigmoid"; @@ -601,26 +648,27 @@ int main(int argc, char *argv[]) { ops_type == GGML_TSAVORITE_KERNEL_TYPE_NEG || ops_type == GGML_TSAVORITE_KERNEL_TYPE_ABS || ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIN || + ops_type == GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM || ops_type == GGML_TSAVORITE_KERNEL_TYPE_SIGMOID || ops_type == GGML_TSAVORITE_KERNEL_TYPE_SILU) num_of_input_tensors = NUM_INPUT_URINARY_TENSORS; - else + else num_of_input_tensors = NUM_INPUT_TENSORS; if (data_scale) { input1[ops_type] = test_input_scale_1[ops_type]; - elements_A = NUM_ELEMENTS_SCALE; + elements_A = NUM_ELEMENTS_SCALE; if (num_of_input_tensors != NUM_INPUT_URINARY_TENSORS) { input2[ops_type] = test_input_scale_2[ops_type]; - elements_B = NUM_ELEMENTS_SCALE; + elements_B = NUM_ELEMENTS_SCALE; } result_data[ops_type] = test_result_scale[ops_type]; } else { input1[ops_type] = test_input_1[ops_type]; - elements_A = NUM_ELEMENTS; + elements_A = NUM_ELEMENTS; if (num_of_input_tensors != NUM_INPUT_URINARY_TENSORS) { input2[ops_type] = test_input_2[ops_type]; - elements_B = NUM_ELEMENTS; + elements_B = NUM_ELEMENTS; } result_data[ops_type] = test_result[ops_type]; } @@ -676,7 +724,7 @@ int main(int argc, char *argv[]) { uint32_t bits_expected, bits_actual; memcpy(&bits_expected, &result_data[ops_type][i], sizeof(float)); memcpy(&bits_actual, &out_data[i], sizeof(float)); - fprintf(stderr, "Index %d: expected bits %08x, actual bits %08x\n", i, bits_expected, bits_actual); + //fprintf(stderr, "Index %d: expected bits %08x, actual bits %08x\n", i, bits_expected, bits_actual); #endif if (ggml_tsi_compare_two_float(out_data[i], result_data[ops_type][i])) { continue; diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h index b4bfdc05b0184..1932bfa498abc 100644 --- a/ggml/include/ggml-tsavorite.h +++ b/ggml/include/ggml-tsavorite.h @@ -126,8 +126,10 @@ enum ggml_tsavorite_kernel_type { GGML_TSAVORITE_KERNEL_TYPE_NEG, GGML_TSAVORITE_KERNEL_TYPE_ABS, GGML_TSAVORITE_KERNEL_TYPE_SIN, + GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM, GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, GGML_TSAVORITE_KERNEL_TYPE_SILU, + GGML_TSAVORITE_KERNEL_TYPE_MUL_MAT, GGML_TSAVORITE_KERNEL_TYPE_COUNT }; @@ -162,10 +164,16 @@ extern void _mlir_ciface_txe_abs_host(void *a, void *res); extern void _mlir_ciface_txe_sin_host(void *a, void *res); extern void _mlir_ciface_txe_sigmoid_host(void *a, void *res); extern void _mlir_ciface_txe_silu_host(void *a, void *res); +extern void _mlir_ciface_txe_mul_mat_host(void *a, void *b, void *res, void *pre_mask); +extern void _mlir_ciface_txe_rms_norm_host(void *a, void *res, void *buf); +extern void _mlir_ciface_txe_rms_norm_6_host(void *a, void *res, void *buf); +extern void _mlir_ciface_txe_rms_norm_512_host(void *a, void *res, void *buf); + extern void ggml_tsi_log_tensor_data(tensor_log log_data); #define NUM_OF_TXES 1 -#define MEM_REF_DESCRIPTOR_RANK 1 +#define MEM_REF_DESCRIPTOR_RANK 4 +#define TSI_TVU_LOAD_SIZE 32 // // backend API diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index e0e31363e4888..71d8e71a1a5d5 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -861,6 +861,10 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru } } +static void anoop_backend() +{ + return; +} // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits @@ -875,6 +879,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg }; ggml_free(sched->ctx); + //printf("\n\n ANOOP ggml_backend_sched_split_graph is called\n\n"); sched->ctx = ggml_init(params); if (sched->ctx == NULL) { @@ -932,6 +937,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } int * node_backend_id = &tensor_backend_id(node); + + if (node && node->op == GGML_OP_RMS_NORM) { + if ((node->ne[1] == 1 || node->ne[1] == 6 || node->ne[1] == 512) && node->ne[2] == 1 && (node->ne[3] == 1)) { + ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); + //anoop_backend(); + } + } if (*node_backend_id != -1) { if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) @@ -942,6 +954,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // Below Code is Optimization which i am disabling for now since we have not implemented other // Operation at tsavorite } else { + //if (node && node->op == GGML_OP_RMS_NORM) + // printf("\n ANOOP RMS COUNT -First STEP"); ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); } } @@ -955,6 +969,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } int * node_backend_id = &tensor_backend_id(node); +#if 0 + if (node && node->op == GGML_OP_RMS_NORM) { + if ((node->ne[1] == 1 || node->ne[1] == 512) && node->ne[2] == 1 && (node->ne[3] == 1)) { + ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); + //anoop_backend(); + } + } +#endif if (*node_backend_id != -1) { if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) @@ -962,7 +984,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else if (cur_backend_id != -1) { + } else if (cur_backend_id != -1) { + if (cur_backend_id != 0) + printf("\n AT GRAPH SPLIT expand gpu up"); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -976,9 +1000,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } int * node_backend_id = &tensor_backend_id(node); + +#if 0 + if (node && node->op == GGML_OP_RMS_NORM) { + if ((node->ne[1] == 1 || node->ne[1] == 512) && node->ne[2] == 1 && (node->ne[3] == 1)) { + ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); + //anoop_backend(); + } + } +#endif if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + //if (cur_backend_id != 0) + // printf("\n AT GRAPH SPLIT expand rest down"); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } @@ -992,9 +1027,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } int * node_backend_id = &tensor_backend_id(node); + + if (node && node->op == GGML_OP_RMS_NORM) { + if ((node->ne[1] == 1 || node->ne[1] == 512 || node->ne[1] == 6) && node->ne[2] == 1 && (node->ne[3] == 1)) { + ggml_backend_sched_set_if_supported(sched, node, 0, node_backend_id); + anoop_backend(); + } + } if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; } else if (cur_backend_id != -1) { + if (cur_backend_id != 0) + printf("\n AT GRAPH SPLIT expand rest up"); ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 221182445ea34..16309ccfb4049 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1297,6 +1297,7 @@ static void ggml_compute_forward_mul_mat( const bool src1_cont = ggml_is_contiguous(src1); if (src1_cont) { + //printf("\n ANOOP GGML IS CONTIGIOUS\n"); for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) if (!llamafile_sgemm(params, @@ -1813,6 +1814,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_RMS_NORM: { + //printf("\n under CPU GGML_OP_RMS_NORM 1\n"); ggml_compute_forward_rms_norm(params, tensor); } break; case GGML_OP_RMS_NORM_BACK: diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp index ae36728a8d3ce..77cc8ecb447a5 100644 --- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp +++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp @@ -62,7 +62,10 @@ struct _txe_device_t { }; struct _txe_compute_pipeline_state_t { + void (*_mlir_fptr_3_input)(void *, void *, void *, void *); void (*_mlir_fptr_2_input)(void *, void *, void *); + void (*_mlir_fptr_2_input_6)(void *, void *, void *); + void (*_mlir_fptr_2_input_512)(void *, void *, void *); void (*_mlir_fptr_1_input)(void *, void *); std::string kernel_name; int reserved; @@ -402,6 +405,11 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ kernel_pipeline->kernel_name = "TXE_MULT"; flag = true; break; + case GGML_TSAVORITE_KERNEL_TYPE_MUL_MAT: + kernel_pipeline->_mlir_fptr_3_input = &_mlir_ciface_txe_mul_mat_host; + kernel_pipeline->kernel_name = "TXE_MUL_MAT"; + flag = true; + break; case GGML_TSAVORITE_KERNEL_TYPE_DIV: kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_div_host; kernel_pipeline->kernel_name = "TXE_DIV"; @@ -442,6 +450,13 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_ kernel_pipeline->kernel_name = "TXE_SILU"; flag = true; break; + case GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM: + kernel_pipeline->_mlir_fptr_2_input = &_mlir_ciface_txe_rms_norm_host; + kernel_pipeline->_mlir_fptr_2_input_6 = &_mlir_ciface_txe_rms_norm_6_host; + kernel_pipeline->_mlir_fptr_2_input_512 = &_mlir_ciface_txe_rms_norm_512_host; + kernel_pipeline->kernel_name = "TXE_RMS_NORM"; + flag = true; + break; default: break; } @@ -595,6 +610,8 @@ static struct ggml_backend_tsavorite_context *ggml_tsavorite_init(ggml_backend_d GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIN, true); GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SIGMOID, true); GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_SILU, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_MUL_MAT, true); + GGML_TSAVORITE_KERNEL(GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM, true); } GGML_TSAVORITE_LOG_INFO("End %s\n", __func__); @@ -692,10 +709,12 @@ static bool ggml_tsavorite_supports_op(const struct ggml_backend_tsavorite_devic case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: + //case GGML_OP_MUL_MAT: case GGML_OP_DIV: case GGML_OP_SQRT: case GGML_OP_SQR: case GGML_OP_SIN: + case GGML_OP_RMS_NORM: break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -744,6 +763,35 @@ static void ggml_tsavorite_decompose_unary_kernel(uint32_t num_elem, ggml_tensor return; } +static void anoop() { + return; +} + +template +// Assumes tsi_alloc is available and returns a pointer to allocated memory +static MemRefDescriptor* create_pred_mask(int K) { + // TVU load size (e.g., 32 for 1024-bit vector with 32-bit elements) + const int32_t tvu_size = TSI_TVU_LOAD_SIZE; + //printf("\n ANOOP Print Rank %d and K %d \n", Rank, K); + + // Round up K to the next multiple of tvu_size + int32_t num_of_elem = ((K % tvu_size) != 0) ? ((K / tvu_size) + 1) * tvu_size : K; + + // Allocate memory dynamically: space for header + data + MemRefDescriptor* header = (MemRefDescriptor*) tsi_alloc( + sizeof(MemRefDescriptor) + num_of_elem * sizeof(float) + ); + + // Advance pointer to skip header and get to data + int32_t* pred_mask_data = (int32_t*)(header + 1); + + // Fill the mask: 1 for indices < K, 0 otherwise + for (int32_t i = 0; i < num_of_elem; ++i) { + pred_mask_data[i] = (i < K) ? 1 : 0; + } + return header; +} + // nodes are intermediate which has multiple src tensors & operation // Here we create multiple thread // Each Thread run the command buffer & pick Tensor and execute and get the result back base on @@ -837,6 +885,11 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MULT; num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; break; + case GGML_OP_MUL_MAT: + printf("\n AT COMPUTE OF MUL_MAT\n"); + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_MUL_MAT; + num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; + break; case GGML_OP_DIV: kernel_type = GGML_TSAVORITE_KERNEL_TYPE_DIV; num_of_input_tensors = TSAVORITE_TWO_INPUT_TENSORS; @@ -853,6 +906,11 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, kernel_type = GGML_TSAVORITE_KERNEL_TYPE_SIN; num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; break; + case GGML_OP_RMS_NORM: + //printf("\n AT COMPUTE OF RMS_NORM\n"); + kernel_type = GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM; + num_of_input_tensors = TSAVORITE_UNARY_INPUT_TENSORS; + break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_NEG: @@ -884,7 +942,8 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, } if (!ctx->kernels[kernel_type].pipeline || - (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && + (!ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input && + !ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input && !ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input)) { GGML_TSAVORITE_LOG_ERROR("Kernel Type %d, not supported \n", kernel_type); return GGML_STATUS_ABORTED; @@ -965,6 +1024,38 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, log_data.tensor = src1; ggml_tsi_log_tensor_data(log_data); } + if (kernel_type == GGML_TSAVORITE_KERNEL_TYPE_MUL_MAT) { + printf("\n ANOOP I am calling MUL_MAT KERNEL\n"); + int K = src0->ne[0]; + // tsi_alloc inside below function + anoop(); + MemRefDescriptor* pred_mask = create_pred_mask(K); + + if (!pred_mask) { + GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for pred_mask \n"); + return GGML_STATUS_ABORTED; + } + pred_mask->offset = 0; + pred_mask->data = (void *)(pred_mask+1); + + for(int i=0; i < 4; ++i) { + srcP0->shape[i] = src0->ne[i]; + srcP1->shape[i] = src1->ne[i]; + nodeP->shape[i] = node->ne[i]; + pred_mask->shape[i] = 0; + printf("\n ANOOP src0 ne size %d for index %d nb %d\n", src0->ne[i], i, src0->nb[i]); + printf("\n ANOOP src1 ne size %d for index %d nb %d\n", src1->ne[i], i, src1->nb[i]); + printf("\n ANOOP node ne size %d for index %d nb %d\n", node->ne[i], i, node->nb[i]); + } + // kernel call + printf("\n ANOOP Before MUL MAT done checking contigious src0 %d src1 %d node %d \n",ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(node)); + anoop(); + ctx->kernels[kernel_type].pipeline->_mlir_fptr_3_input(srcP0, srcP1, nodeP, pred_mask); + printf("\n ANOOP After MUL MAT done\n"); + anoop(); + ++device->stats.op_run_count[kernel_type].num_of_kernel_call; + //tsi_free(pred_mask); + } else { ggml_tensor *dst = node; const int nr = ggml_nrows(src0); @@ -986,9 +1077,13 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, float *src1_ptr = (float *)((char *)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11); for (int64_t r = 0; r < nr0; ++r) { - srcP0->shape[Rank - 1] = ne10; - srcP1->shape[Rank - 1] = ne10; - nodeP->shape[Rank - 1] = ne10; + //srcP0->shape[Rank - 1] = ne10; + //srcP1->shape[Rank - 1] = ne10; + //nodeP->shape[Rank - 1] = ne10; + srcP0->shape[0] = ne10; + srcP1->shape[0] = ne10; + nodeP->shape[0] = ne10; + srcP1->data = srcP1->base = (void *)(src1_ptr); srcP0->data = srcP0->base = (void *)(src0_ptr + r * ne10); nodeP->data = nodeP->base = (void *)(dst_ptr + r * ne10); @@ -997,6 +1092,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, ++device->stats.op_run_count[kernel_type].num_of_kernel_call; } } + } if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { log_data.data_type = GGML_TSAVORITE_TENSOR_NODE; @@ -1020,6 +1116,7 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, (struct ggml_backend_tsavorite_device_context *)backend->device->context); return GGML_STATUS_ABORTED; } + //printf("\n op type %d and ne size %d %d %d %d",node->op, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); srcP0 = (MemRefDescriptor *)src0->data; nodeP = (MemRefDescriptor *)node->data; // This is for tsavorite MemRef Header hence getting header @@ -1058,12 +1155,66 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend, srcP0->data = srcP0->base = (void *)((float *)src0->data); nodeP->data = nodeP->base = (void *)((float *)node->data); - srcP0->shape[Rank - 1] = num_elem_src0; - nodeP->shape[Rank - 1] = num_elem_src0; - srcP0->strides[Rank - 1] = 0; - nodeP->strides[Rank - 1] = 0; - // kernel call - ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP); + //srcP0->shape[Rank - 1] = num_elem_src0; + //nodeP->shape[Rank - 1] = num_elem_src0; + //srcP0->strides[Rank - 1] = 0; + //nodeP->strides[Rank - 1] = 0; + srcP0->shape[0] = num_elem_src0; + nodeP->shape[0] = num_elem_src0; + srcP0->strides[0] = 0; + nodeP->strides[0] = 0; + + if (kernel_type == GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM) { + int K = 96; + // tsi_alloc inside below function + MemRefDescriptor* buf = create_pred_mask(K); + + if (!buf) { + GGML_TSAVORITE_LOG_ERROR("tsi_alloc failied for creating memory for buf \n"); + return GGML_STATUS_ABORTED; + } + buf->offset = 0; + buf->data = buf->base = (void *)(buf+1); + float *val = (float *)buf->data; + int i =64; + for(; i <= 95; ++i) + val[i] = node->ne[0]; + + + for ( i = 0; i < GGML_MAX_DIMS && src0->nb[i] != 0; ++i) { + if (src0->ne[i] == 0) { + srcP0->shape[i] = 1; + nodeP->shape[i] = 1; + } + else { + srcP0->shape[i] = src0->ne[i]; + nodeP->shape[i] = node->ne[i]; + } + srcP0->strides[i] = 0; + nodeP->strides[i] = 0; + } + anoop(); + + //printf("\n size of tensor for RMS_NORM %d", num_elem_src0); + //ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, nodeP, buf); + //printf("\n RSM SIZE ne0 %d ne1 %d ne2 %d ne3 %d ", src0->ne[0], src0->ne1[1], src0->ne[2], src0->ne[3]); + if(src0->ne[1] == 512) { + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input_512(srcP0, nodeP, buf); + printf("\n ANOOP TSAVORITE COMPUTE RSM 512 ne0 %d ne2 %d ne3 %d", src0->ne[0], src0->ne[2], src0->ne[3]); + } + if(src0->ne[1] == 6) { + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input_6(srcP0, nodeP, buf); + printf("\n ANOOP TSAVORITE COMPUTE RSM 6 ne0 %d ne2 %d ne3 %d", src0->ne[0], src0->ne[2], src0->ne[3]); + } + if(src0->ne[1] == 1) { + ctx->kernels[kernel_type].pipeline->_mlir_fptr_2_input(srcP0, nodeP, buf); + printf("\n ANOOP TSAVORITE COMPUTE RSM 1 ne0 %d ne2 %d ne3 %d", src0->ne[0], src0->ne[2], src0->ne[3]); + } + + anoop(); + } + else + ctx->kernels[kernel_type].pipeline->_mlir_fptr_1_input(srcP0, nodeP); ++device->stats.op_run_count[kernel_type].num_of_kernel_call; if (ggml_tsavorite_log_type_val == GGML_TSAVORITE_LOG_DEBUG) { @@ -1363,7 +1514,7 @@ static size_t ggml_backend_tsavorite_buffer_type_get_alloc_size(ggml_backend_buf "\n\n\n\n Calculating---- Alloc ----Size header %lu and data %lu \n\n\n\n ", sizeof(tensor_data_header), ggml_nbytes(tensor)); - return (sizeof(tensor_data_header) + ggml_nbytes(tensor)); + return (sizeof(tensor_data_header) + ggml_nbytes(tensor) + 1024); TSI_UNUSED(buft); } @@ -1784,9 +1935,11 @@ static bool ggml_backend_tsavorite_device_offload_op(ggml_backend_dev_t dev, case GGML_OP_SUB: case GGML_OP_DIV: case GGML_OP_MUL: + //case GGML_OP_MUL_MAT: case GGML_OP_SQRT: case GGML_OP_SQR: case GGML_OP_SIN: + case GGML_OP_RMS_NORM: break; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 96e1a05440e22..4ee7614bbf9b6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2624,34 +2624,6 @@ llama_perf_context_data llama_perf_context(const llama_context * ctx) { #ifdef GGML_PERF -void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { - LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n"); - LLAMA_LOG_TSAVORITE(" %-16s %7s %14s %16s\n", "Op", "Runs", "Total us", "Avg us"); - - for (int i = 0; i < GGML_OP_COUNT; ++i) { - if (totals[i].runs > 0) { - LLAMA_LOG_TSAVORITE(" %-16s %7ld %14ld %16.2f\n", - totals[i].op_name ? totals[i].op_name : "UNKNOWN", - totals[i].runs, - totals[i].total_us, - (double)totals[i].total_us / totals[i].runs); - } - - // Unary sub-op breakdown - if (i == GGML_OP_UNARY) { - for (int j = 0; j < GGML_UNARY_OP_COUNT; ++j) { - if (totals[i].unary_subtotals[j].runs > 0) { - LLAMA_LOG_TSAVORITE(" -> %-11s %7ld %14ld %16.2f\n", - ggml_unary_op_name((enum ggml_unary_op) j), - totals[i].unary_subtotals[j].runs, - totals[i].unary_subtotals[j].total_us, - (double)totals[i].unary_subtotals[j].total_us / totals[i].unary_subtotals[j].runs); - } - } - } - } -} -#elif GGML_PERF_DETAIL void ggml_perf_print_totals(struct ggml_perf_totals totals[GGML_OP_COUNT]) { LLAMA_LOG_TSAVORITE("\n=== GGML Perf Summary ===\n"); LLAMA_LOG_TSAVORITE(" %-16s %-8s %7s %14s %16s\n", "Op", "Target", "Runs", "Total us", "Avg us"); diff --git a/tsi-pkg-build.sh b/tsi-pkg-build.sh index 9dcb367001ddf..0224355fa4886 100755 --- a/tsi-pkg-build.sh +++ b/tsi-pkg-build.sh @@ -3,40 +3,40 @@ set -e #Ensure prerequisites are met as follows echo 'updating submodule' -git submodule update --recursive --init -cd ggml-tsi-kernel/ +#git submodule update --recursive --init +#cd ggml-tsi-kernel/ module load tsi4 gcc/13.3.0 export MLIR_SDK_VERSION=/proj/rel/sw/sdk-r.0.1.8 echo 'creating python virtual env' -/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation -source blob-creation/bin/activate +#/proj/local/Python-3.10.12/bin/python3 -m venv blob-creation +#source blob-creation/bin/activate echo 'installing mlir and python dependencies' -pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt -pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.4.1-py3-none-any.whl -pip install onnxruntime-training +#pip install -r ${MLIR_SDK_VERSION}/compiler/python/requirements-common.txt +#pip install ${MLIR_SDK_VERSION}/compiler/python/mlir_external_packages-1.4.1-py3-none-any.whl +#pip install onnxruntime-training #build TSI kernels for the Tsavorite backend #First for FPGA echo 'creating fpga kernel' -cd fpga-kernel -cmake -B build-fpga -./create-all-kernels.sh +#cd fpga-kernel +#cmake -B build-fpga +#./create-all-kernels.sh #The for Posix Use cases echo 'creating posix kernel' -cd ../posix-kernel/ -./create-all-kernels.sh +#cd ../posix-kernel/ +#./create-all-kernels.sh #Change directory to top level llama.cpp -cd ../../ +#cd ../../ #Compile for posix with build-posix as a target folder echo 'building llama.cp, ggml for tsavorite and other binary for posix' -cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF" -cmake --build build-posix --config Release +#cmake -B build-posix -DGGML_TSAVORITE=ON -DGGML_TSAVORITE_TARGET=posix -DCMAKE_C_FLAGS="-DGGML_PERF" -DCMAKE_CXX_FLAGS="-DGGML_PERF" +#cmake --build build-posix --config Release #Compile for fpga with build-fpga as a target folder