@@ -296,8 +296,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
296296#undef GGML_METAL_ADD_KERNEL
297297 }
298298
299- GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
300299#if TARGET_OS_OSX
300+ // print MTL GPU family:
301+ GGML_METAL_LOG_INFO (" %s : GPU name: %s \n " , __func__, [[ctx->device name ] UTF8String ]);
302+ GGML_METAL_LOG_INFO (" %s : GPU arch: %s \n " , __func__, [[ctx->device architecture ].name UTF8String ]);
303+
304+ // determine max supported GPU family
305+ // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
306+ // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
307+ for (int i = MTLGPUFamilyApple9 + 10 ; i >= MTLGPUFamilyApple1 ; --i) {
308+ if ([ctx->device supportsFamily: i]) {
309+ GGML_METAL_LOG_INFO (" %s : GPU family: MTLGPUFamilyApple%d (%d )\n " , __func__, i - MTLGPUFamilyApple1 + 1 , i);
310+ break ;
311+ }
312+ }
313+
314+ GGML_METAL_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx->device .hasUnifiedMemory ? " true" : " false" );
301315 GGML_METAL_LOG_INFO (" %s : recommendedMaxWorkingSetSize = %8.2f MB\n " , __func__, ctx->device .recommendedMaxWorkingSetSize / 1024.0 / 1024.0 );
302316 if (ctx->device .maxTransferRate != 0 ) {
303317 GGML_METAL_LOG_INFO (" %s : maxTransferRate = %8.2f MB/s\n " , __func__, ctx->device .maxTransferRate / 1024.0 / 1024.0 );
@@ -351,16 +365,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
351365 GGML_METAL_DEL_KERNEL (mul_mv_q4_K_f32);
352366 GGML_METAL_DEL_KERNEL (mul_mv_q5_K_f32);
353367 GGML_METAL_DEL_KERNEL (mul_mv_q6_K_f32);
354- GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
355- GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
356- GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
357- GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
358- GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
359- GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
360- GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
361- GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
362- GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
363- GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
368+ if ([ctx->device supportsFamily: MTLGPUFamilyApple7]) {
369+ GGML_METAL_DEL_KERNEL (mul_mm_f32_f32);
370+ GGML_METAL_DEL_KERNEL (mul_mm_f16_f32);
371+ GGML_METAL_DEL_KERNEL (mul_mm_q4_0_f32);
372+ GGML_METAL_DEL_KERNEL (mul_mm_q8_0_f32);
373+ GGML_METAL_DEL_KERNEL (mul_mm_q4_1_f32);
374+ GGML_METAL_DEL_KERNEL (mul_mm_q2_K_f32);
375+ GGML_METAL_DEL_KERNEL (mul_mm_q3_K_f32);
376+ GGML_METAL_DEL_KERNEL (mul_mm_q4_K_f32);
377+ GGML_METAL_DEL_KERNEL (mul_mm_q5_K_f32);
378+ GGML_METAL_DEL_KERNEL (mul_mm_q6_K_f32);
379+ }
364380 GGML_METAL_DEL_KERNEL (rope_f32);
365381 GGML_METAL_DEL_KERNEL (rope_f16);
366382 GGML_METAL_DEL_KERNEL (alibi_f32);
@@ -986,32 +1002,36 @@ void ggml_metal_graph_compute(
9861002 } break ;
9871003 case GGML_OP_MUL_MAT:
9881004 {
989- // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
990-
9911005 GGML_ASSERT (ne00 == ne10);
992- // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
993- uint gqa = ne12/ne02;
9941006 GGML_ASSERT (ne03 == ne13);
9951007
1008+ const uint gqa = ne12/ne02;
1009+
9961010 // find the break-even point where the matrix-matrix kernel becomes more efficient compared
997- // to the matrix-vector kernel. the numbers below are measured on M2 Ultra
998- // not sure if this translates across all chips
1011+ // to the matrix-vector kernel
9991012 int ne11_mm_min = 1 ;
10001013
1001- switch (src0t) {
1002- case GGML_TYPE_F16: ne11_mm_min = 2 ; break ;
1003- case GGML_TYPE_Q8_0: ne11_mm_min = 7 ; break ;
1004- case GGML_TYPE_Q2_K: ne11_mm_min = 15 ; break ;
1005- case GGML_TYPE_Q3_K: ne11_mm_min = 7 ; break ;
1006- case GGML_TYPE_Q4_0:
1007- case GGML_TYPE_Q4_1: ne11_mm_min = 15 ; break ;
1008- case GGML_TYPE_Q4_K: ne11_mm_min = 11 ; break ;
1009- case GGML_TYPE_Q5_0: // not tested yet
1010- case GGML_TYPE_Q5_1: ne11_mm_min = 13 ; break ; // not tested yet
1011- case GGML_TYPE_Q5_K: ne11_mm_min = 7 ; break ;
1012- case GGML_TYPE_Q6_K: ne11_mm_min = 7 ; break ;
1013- default : ne11_mm_min = 1 ; break ;
1014+ #if 0
1015+ // the numbers below are measured on M2 Ultra for 7B and 13B models
1016+ // these numbers do not translate to other devices or model sizes
1017+ // TODO: need to find a better approach
1018+ if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
1019+ switch (src0t) {
1020+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
1021+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
1022+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
1023+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
1024+ case GGML_TYPE_Q4_0:
1025+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
1026+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
1027+ case GGML_TYPE_Q5_0: // not tested yet
1028+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
1029+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
1030+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
1031+ default: ne11_mm_min = 1; break;
1032+ }
10141033 }
1034+ #endif
10151035
10161036 // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
10171037 // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
0 commit comments