@@ -3207,3 +3207,102 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         }
     }
 }
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+    // Do in-place transformation. Allocate a scratch buffer large enough for one group of 4 interleaved rows
+    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t * new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x4 * dst = (block_q4_0x4 *) *pmem;
+    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0 dst_tmp[4];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // number of rows, assumed to be a multiple of nrows_interleaved
+    int nrows_interleaved = 4;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * nrows_interleaved; // advance past the group of rows just repacked
+    }
+    return 0;
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, uint8_t ** pmem, size_t * psize) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+    GGML_ASSERT(t->ne[0] % 8 == 0);
+    GGML_ASSERT(interleave_block == 8);
+
+    // Do in-place transformation. Allocate a scratch buffer large enough for one group of 8 interleaved rows
+    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
+    if (size > *psize) {
+        uint8_t * new_mem = realloc(*pmem, size);
+        if (!new_mem) {
+            return -1;
+        }
+        *pmem = new_mem;
+        *psize = size;
+    }
+    block_q4_0x8 * dst = (block_q4_0x8 *) *pmem;
+    block_q4_0 * src = (block_q4_0 *) t->data;
+    block_q4_0 dst_tmp[8];
+    int n = t->ne[0];
+    int nrow = t->ne[1]; // number of rows, assumed to be a multiple of nrows_interleaved
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_0;
+    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
+        int cnt = 0;
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+        }
+        memcpy(src, dst, size);
+        src += cnt * nrows_interleaved; // advance past the group of rows just repacked
+    }
+    return 0;
+}
+
+// Repack Q4_0 weights into the layout expected by the optimized AArch64 kernels, if the CPU supports them
+void ggml_prepare_optimal_kernel(struct ggml_tensor * cur, uint8_t ** pmem, size_t * psize) {
+    UNUSED(cur);
+    UNUSED(pmem);
+    UNUSED(psize);
+
+#if defined(__ARM_ARCH)
+    if (cur->type == GGML_TYPE_Q4_0) {
+        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_8_8;
+            }
+        }
+        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_8;
+            }
+        }
+        else if (ggml_cpu_has_neon()) {
+            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
+                cur->type = GGML_TYPE_Q4_0_4_4;
+            }
+        }
+    }
+#endif
+}
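
For context, here is a minimal sketch of how this hook could be driven from the loading path. This is an assumption, not part of this hunk: the caller name repack_all_weights is hypothetical; the only requirement implied by the code above is that a single scratch buffer (pmem/psize) is reused across tensors and freed once at the end.

    // Hypothetical caller (illustrative only): walk a context's tensors and let
    // ggml_prepare_optimal_kernel repack any eligible Q4_0 weights in place,
    // reusing one scratch buffer for all of them.
    static void repack_all_weights(struct ggml_context * ctx) {
        uint8_t * scratch = NULL;
        size_t scratch_size = 0;
        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            ggml_prepare_optimal_kernel(t, &scratch, &scratch_size);
        }
        free(scratch); // the scratch buffer is grown with realloc() inside the repack helpers
    }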