Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions docs/backend/CANN.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr

## Environment variable setup

### GGML_CANN_ASYNC_MODE

Enables asynchronous operator submission. Disabled by default.

### GGML_CANN_MEM_POOL

Specifies the memory pool management strategy:
Specifies the memory pool management strategy. The default is vmm.

- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.

- prio: Employs a priority queue-based memory pool management.

- leg: Uses a fixed-size buffer pool.

### GGML_CANN_DISABLE_BUF_POOL_CLEAN
Expand All @@ -312,9 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe

### GGML_CANN_WEIGHT_NZ

Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
Converts the matmul weight format from ND to NZ to improve performance. Enabled by default.

### GGML_CANN_DISABLE_ACL_GRAPH
### GGML_CANN_ACL_GRAPH

When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
When enabled, operators are executed using ACL graph execution rather than in op-by-op (eager) mode. Enabled by default.
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1956,7 +1956,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
aclTensor* acl_weight_tensor;

// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (weight_to_nz && is_matmul_weight(weight)) {
int64_t acl_stride[2] = {1, transpose_ne[1]};

Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-cann/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
device, async_mode ? "ON" : "OFF");
#ifdef USE_ACL_GRAPH
acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
__func__, device,
acl_graph_mode ? "GRAPH" : "EAGER",
Expand Down
26 changes: 21 additions & 5 deletions ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
// Why aclrtSynchronizeDevice?

// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
Expand Down Expand Up @@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
ACL_MEMCPY_DEVICE_TO_DEVICE));
return true;
} else {
#ifdef ASCEND_310P
// TODO: Support 310p P2P copy
return false;
#endif
// Different device but can access by peer.
int32_t canAccessPeer = 0;
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
Expand Down Expand Up @@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
int64_t ne0 = tensor->ne[0];

// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));

// last line must bigger than 32, because every single op deal at
// least 32 bytes.
Expand Down Expand Up @@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
ggml_backend_is_cann(backend_dst));

GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));

if (!ggml_backend_buffer_is_cann(src->buffer) ||
!ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
Expand All @@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
return true;
}
if (backend_src != backend_dst) {
#ifdef ASCEND_310P
// TODO: Support 310p P2P copy
return false;
#endif
ggml_backend_cann_buffer_context* buf_ctx_src =
(ggml_backend_cann_buffer_context*)buf_src->context;
ggml_backend_cann_buffer_context* buf_ctx_dst =
Expand All @@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
}

// need open both directions for memcpyasync between devices.
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
Expand All @@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_src->stream()));

//TODO: workaround for Event didn`t work here.
aclrtSynchronizeStream(cann_ctx_src->stream());
// record event on src stream after the copy
if (!cann_ctx_src->copy_event) {
ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
}
ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));

// wait on dst stream for the copy to complete
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
Expand Down