diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 434023dd22ab3..240e8a1b2c025 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -894,14 +894,13 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
 }
 
 /**
- * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ * @brief Get or expand a cached tensor filled with a scalar value.
  *
- * This function manages cached device memory for float32 tensors. If the current
+ * This function manages cached device memory for tensors. If the current
  * cache size is insufficient for the requested tensor shape, the old memory will
- * be released and new memory will be allocated. The allocated buffer is then
- * initialized either with zeros (when @p value == 0.0f) or with the given scalar
- * value using CANN operations. Finally, an aclTensor object is created from the
- * cached memory and returned.
+ * be released and new memory will be allocated. The allocated buffer is
+ * initialized with the given scalar value using CANN operations.
+ * Finally, an aclTensor object is created from the cached memory and returned.
  *
  * @param ctx The CANN backend context that manages device memory.
  * @param buffer A pointer to the cached device buffer (will be allocated
@@ -910,17 +909,19 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
  *                      updated when the cache is expanded.
  * @param ne The tensor shape array (number of elements in each dimension).
  * @param nb The stride size for each dimension.
+ * @param dtype The data type of the cached tensor.
  * @param dims The number of tensor dimensions.
  * @param value The scalar value used to fill the tensor (supports zero
  *              initialization via memset or arbitrary values via fill_scalar).
  * @return An aclTensor pointer created from the cached buffer.
  */
-static aclTensor* get_f32_cache_acl_tensor(
+static aclTensor* get_cache_acl_tensor(
     ggml_backend_cann_context& ctx,
     void** buffer,
     int64_t &cache_element,
     int64_t* ne,
     size_t* nb,
+    ggml_type dtype,
     int64_t dims,
     float value) {
     // Calculate total number of elements
@@ -928,7 +929,7 @@ static aclTensor* get_f32_cache_acl_tensor(
     for (int i = 0; i < dims; i++) {
         n_element *= ne[i];
     }
-    size_t size = n_element * sizeof(float);
+    size_t size = n_element * ggml_type_size(dtype);
 
     // Allocate or expand cache if needed
     if (cache_element < n_element) {
@@ -941,19 +942,17 @@ static aclTensor* get_f32_cache_acl_tensor(
         cache_element = n_element;
 
         // Initialize cache
-        if (value == 0.0f) {
-            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
-        } else {
-            int64_t pool_ne[1] = { n_element };
-            size_t pool_nb[1] = { sizeof(float) };
-            aclTensor* acl_value = ggml_cann_create_tensor(
-                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
-            aclnn_fill_scalar(ctx, 1, acl_value);
-            ggml_cann_release_resources(ctx, acl_value);
-        }
+        int64_t pool_ne[1] = { n_element };
+        size_t pool_nb[1] = { ggml_type_size(dtype) };
+        aclTensor* acl_value = ggml_cann_create_tensor(
+            *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
+            pool_ne, pool_nb, 1);
+        aclnn_fill_scalar(ctx, value, acl_value);
+        ggml_cann_release_resources(ctx, acl_value);
     }
 
-    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+    return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
+                                   ggml_type_size(dtype), ne, nb, dims);
 }
 
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
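Note that the generalized helper only re-initializes the buffer when it has to grow, so every caller sharing a given cache must request the same dtype and fill value; the two rms_norm caches below each stay homogeneous for exactly that reason. A hypothetical call site (a `ggml_backend_cann_context& ctx` is assumed in scope; `my_cache` and `my_cache_elems` are illustrative names, not from the patch):

```cpp
// Illustrative fragment (not from the patch): fetch an fp16 tensor of ones
// from a persistent cache. my_cache / my_cache_elems stand in for context
// fields such as ctx.rms_norm_one_tensor_cache.cache / .size.
int64_t ne[1] = { 4096 };
size_t  nb[1] = { ggml_type_size(GGML_TYPE_F16) };
aclTensor* ones_f16 = get_cache_acl_tensor(ctx, &my_cache, my_cache_elems,
                                           ne, nb, GGML_TYPE_F16,
                                           /*dims =*/ 1, /*value =*/ 1.0f);
```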
@@ -965,35 +964,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // build gamma, one...
+    // build gamma.
     size_t acl_gamma_nb[GGML_MAX_DIMS];
-    acl_gamma_nb[0] = sizeof(float);
+    // gamma's type is the same as dst's.
+    acl_gamma_nb[0] = ggml_type_size(dst->type);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
         acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
     }
-    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+    aclTensor* acl_gamma = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_one_tensor_cache.cache,
         ctx.rms_norm_one_tensor_cache.size,
         src->ne,
         acl_gamma_nb,
+        dst->type,
         1,  // dims
         1.0f  // value
     );
 
-    // build rstd, zero...
+    // build rstd.
     int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
     size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+    // rstd will always be F32.
     acl_rstd_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
         acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
     }
-    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+    aclTensor* acl_rstd = get_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_zero_tensor_cache.cache,
         ctx.rms_norm_zero_tensor_cache.size,
         acl_rstd_ne,
         acl_rstd_nb,
+        GGML_TYPE_F32,
         GGML_MAX_DIMS - 1,
         0.0f  // value
     );
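For reference, the two cached tensors feed the standard RMS-norm computation: ggml's RMS_NORM op has no learned weight, so gamma is a cached tensor of ones (an identity rescale that can follow dst's dtype), while rstd is a per-row statistic the kernel emits in fp32 regardless of the activation dtype, which is why that cache stays pinned to GGML_TYPE_F32. A minimal scalar sketch, assuming the textbook definition:

```cpp
#include <cmath>
#include <cstddef>

// Scalar reference of RMS norm over one row of n values.
static void rms_norm_ref(const float* x, float* y, size_t n, float eps) {
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; i++) {
        sum_sq += x[i] * x[i];
    }
    const float rstd = 1.0f / std::sqrt(sum_sq / n + eps); // reciprocal RMS, fp32
    for (size_t i = 0; i < n; i++) {
        y[i] = x[i] * rstd; // times gamma[i], which is 1.0f here
    }
}
```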
@@ -1765,33 +1768,35 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // src
     ggml_tensor* src1 = dst->src[1];  // index
 
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
     switch (src0->type) {
-        case GGML_TYPE_F32: {
-            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-            ggml_cann_pool_alloc src_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-            void* src_trans_buffer = src_buffer_allocator.get();
-            size_t src_trans_nb[GGML_MAX_DIMS];
-            src_trans_nb[0] = sizeof(float);
-            for (int i = 1; i < GGML_MAX_DIMS; i++) {
-                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            if(src0->type == dst->type) {
+                aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+            } else {
+                aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = dst->nb[0];
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+                    src0->ne, src_trans_nb, GGML_MAX_DIMS);
+                aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+                aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
+                                      dst->data, dst->ne, dst->nb,
+                                      src1, dst->type);
+                ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             }
-            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
-                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
-                src0->ne, src_trans_nb, GGML_MAX_DIMS);
-            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
-            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
-                                  dst->data, dst->ne, dst->nb,
-                                  src1, dst->type);
-            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
             break;
-        }
         case GGML_TYPE_Q8_0: {
             // add 1 dim for bcast mul.
             size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
@@ -1799,7 +1804,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
                 *dequant_ne;
             int64_t scale_offset = 0;
-
             // [3,4,5,64] -> [3,4,5,2,32]
             weight_ne[0] = QK8_0;
             weight_ne[1] = src0->ne[0] / QK8_0;
@@ -1809,7 +1813,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 weight_ne[i] = src0->ne[i - 1];
                 weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,1]
             scale_ne[0] = 1;
             scale_ne[1] = src0->ne[0] / QK8_0;
@@ -1819,18 +1822,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 scale_ne[i] = src0->ne[i - 1];
                 scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
             }
-
             // [3,4,5,64] -> [3,4,5,2,32]
             dequant_ne = weight_ne;
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
             }
-
             scale_offset = ggml_nelements(src0) * sizeof(int8_t);
             ggml_cann_pool_alloc dequant_buffer_allocator(
-                ctx.pool(), ggml_nelements(src0) * sizeof(float));
-
+                ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
                 GGML_MAX_DIMS + 1);
@@ -1838,16 +1838,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
                 GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
             aclTensor* dequant_tensor = ggml_cann_create_tensor(
-                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
+                dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
                 dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
-
             aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
-            dequant_nb[0] = sizeof(float);
+            dequant_nb[0] = ggml_type_size(dst->type);
             dequant_ne = src0->ne;
             for (int i = 1; i < GGML_MAX_DIMS; i++) {
                 dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
            }
-
             aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                   dequant_ne, dequant_nb,
                                   dst->data, dst->ne, dst->nb,
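The shape comments kept above ([3,4,5,64] -> [3,4,5,2,32] for quants, [3,4,5,2,1] for scales) describe how the broadcast multiply dequantizes Q8_0 before the row gather: each 32-value block is scaled by its own factor. A CPU sketch of the same arithmetic; the quants-first-then-scales buffer layout is an assumption read off the patch's `scale_offset` computation, and the fp16 scales are widened to float here for simplicity:

```cpp
#include <cstdint>

constexpr int QK8_0 = 32; // block size, matching ggml's Q8_0

// Dequantize one row of n values: weight view [n/QK8_0, QK8_0] times
// scale view [n/QK8_0, 1], i.e. the broadcast multiply from the hunks above.
static void dequant_q8_0_row(const int8_t* quants, const float* scales,
                             float* out, int64_t n) {
    for (int64_t b = 0; b < n / QK8_0; b++) {   // block index
        for (int i = 0; i < QK8_0; i++) {       // element within block
            out[b * QK8_0 + i] = scales[b] * quants[b * QK8_0 + i];
        }
    }
}
```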
@@ -1965,16 +1963,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // Only check env once.
     static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
-        int64_t acl_stride[2] = {1, transpose_ne[1]};
-
-        // Reverse ne.
-        std::reverse(transpose_ne, transpose_ne + n_dims);
-
-        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
-
-        acl_weight_tensor = aclCreateTensor(
-            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
-            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
     } else {
         acl_weight_tensor =
             ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
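With the hand-rolled `aclCreateTensor` call gone, both branches invoke the same helper and differ only in the `aclFormat` argument, so the decision could collapse to a single call. A minimal equivalent sketch using the names from the hunk (not how the patch actually writes it):

```cpp
// Equivalent fragment: pick the weight format once, then create the tensor.
aclFormat weight_format = (weight_to_nz && is_matmul_weight(weight))
                              ? ACL_FORMAT_FRACTAL_NZ
                              : ACL_FORMAT_ND;
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb,
                                            n_dims, weight_format);
```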
@@ -3178,7 +3168,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         aclTensor* acl_src0_f16_tensor = nullptr;
         aclTensor* acl_src1_f16_tensor = nullptr;
         aclTensor* acl_src2_f16_tensor = nullptr;
-        aclTensor* acl_dst_f16_tensor = nullptr;
 
         // Step 1: cast the src0 (Query) to fp16 if needed
         ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
@@ -3216,22 +3205,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
                                                       src2_bsnd_nb, GGML_MAX_DIMS);
 
-        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
-        void* out_f16_buffer = out_f16_allocator.alloc(
-            ggml_nelements(dst) * faElemSize);
-
-        int64_t* out_f16_ne = src0_bsnd_ne;
-        size_t out_f16_nb[GGML_MAX_DIMS];
-        out_f16_nb[0] = faElemSize;
-        for(int i = 1; i < GGML_MAX_DIMS; ++i){
-            out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
-        }
-
-        acl_dst_f16_tensor = ggml_cann_create_tensor(
-            out_f16_buffer, faDataType, faElemSize,
-            out_f16_ne, out_f16_nb, GGML_MAX_DIMS
-        );
-
         // Step 3: create the PSEShift tensor if needed
         // this tensor is considered as mask (f16) in the llama.cpp
         aclTensor* bcast_pse_tensor = nullptr;
@@ -3334,8 +3307,29 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         int64_t keyAntiquantMode = 0;
         int64_t valueAntiquantMode = 0;
 
-        // Step 5: launch the FusedInferAttentionScoreV2 kernel.
-        // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+        GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+        aclTensor * fa_dst_tensor = nullptr;
+        aclTensor * acl_dst_tensor = nullptr;
+        ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
+        if (dst->type == GGML_TYPE_F32) {
+            void* out_f16_buffer = out_f16_allocator.alloc(
+                ggml_nelements(dst) * faElemSize);
+
+            int64_t* out_f16_ne = src0_bsnd_ne;
+            size_t out_f16_nb[GGML_MAX_DIMS];
+            out_f16_nb[0] = faElemSize;
+            for(int i = 1; i < GGML_MAX_DIMS; ++i){
+                out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+            }
+
+            fa_dst_tensor = ggml_cann_create_tensor(
+                out_f16_buffer, faDataType, faElemSize,
+                out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+            );
+        }
+        else {
+            fa_dst_tensor = ggml_cann_create_tensor(dst);
+        }
 
         GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
             acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
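Together with the get_rows hunk earlier, this establishes one idiom across the patch: when the kernel's native output type (fp16 here, via `faDataType`) already matches dst, the kernel writes dst directly; otherwise the result is staged in the kernel's type and converted with a single cast. A distilled, standalone sketch of the pattern, with illustrative stand-in names rather than the backend's real API:

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

enum class DType { F16, F32 };

static size_t dtype_size(DType t) { return t == DType::F16 ? 2 : 4; }

// Run a kernel whose native output dtype may differ from dst's dtype.
static void run_with_output(DType kernel_out, DType dst_type, void* dst_data,
                            size_t n_elements,
                            const std::function<void(void*)>& run_kernel,
                            const std::function<void(const void*, void*)>& cast) {
    if (kernel_out == dst_type) {
        run_kernel(dst_data);                       // no staging, no cast
    } else {
        std::vector<uint8_t> staging(n_elements * dtype_size(kernel_out));
        run_kernel(staging.data());                 // kernel-native dtype
        cast(staging.data(), dst_data);             // single conversion into dst
    }
}
```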
@@ -3357,23 +3351,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             blockSize, antiquantMode, // blockSize, antiquantMode
             softmaxLseFlag, // softmaxLseFlag
             keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
-            acl_dst_f16_tensor, // attentionOut
+            fa_dst_tensor, // attentionOut
             nullptr // softmaxLse
         );
 
-        // Step 6: post-processing, permute and cast to f32
-        aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-        // TODO: when dst is fp16, don't need cast
-        aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
-        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
-                                    acl_src1_f16_tensor,
-                                    acl_src2_f16_tensor,
-                                    acl_dst_f16_tensor,
-                                    acl_dst_tensor);
-        if(src3 != nullptr){
-            ggml_cann_release_resources(ctx, bcast_pse_tensor);
+        if (dst->type == GGML_TYPE_F32) {
+            // Step 6: post-processing, permute and cast to f32
+            acl_dst_tensor = ggml_cann_create_tensor(dst);
+            aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
         }
+
+        ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+                                    acl_src1_f16_tensor,
+                                    acl_src2_f16_tensor,
+                                    fa_dst_tensor,
+                                    acl_dst_tensor,
+                                    bcast_pse_tensor);
+
     } else {
         GGML_ABORT("Function is not implemented.");
     }
 }
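The consolidated release list may now legitimately contain null handles: `acl_dst_tensor` stays nullptr on the F16 path, and `bcast_pse_tensor` stays nullptr when no mask is given, yet both are passed unconditionally. That only works if the helper skips nulls, which `ggml_cann_release_resources` evidently does. A hypothetical illustration of such a helper, with stand-in types rather than the backend's real implementation:

```cpp
#include <cstdio>
#include <initializer_list>

struct aclTensorStub {};                       // stand-in for aclTensor

static void destroy(aclTensorStub* t) {        // stand-in for aclDestroyTensor
    std::printf("destroying %p\n", static_cast<void*>(t));
}

// Variadic, nullptr-tolerant release: expand the pack, skipping null handles.
template <typename... Ts>
static void release_all(Ts*... tensors) {
    (void)std::initializer_list<int>{
        (tensors != nullptr ? (destroy(tensors), 0) : 0)...
    };
}
```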