@@ -699,6 +699,15 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     ctx->tensor_extras.push_back(extra);
 
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        ggml_cuda_set_device(id);
+
+        // Create events on all devices unconditionally even if they don't actually hold any data.
+        // This is because for very small matrices it's possible for the active device to not hold any data.
+        // But in this case the events are still needed to synchronize the other devices.
+        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+        }
+
         int64_t row_low, row_high;
         get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
 
@@ -717,7 +726,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
 
         // FIXME: do not crash if cudaMalloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_cuda_set_device(id);
         char * buf;
         CUDA_CHECK(ggml_cuda_device_malloc((void **)&buf, size, id));
 
@@ -727,10 +735,6 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
         }
 
         extra->data_device[id] = buf;
-
-        for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-        }
     }
     tensor->extra = extra;
 }
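
For context on the comment introduced above: the events created per device and per stream are consumed by the usual CUDA record/wait pattern, where one device records an event on its stream and another device's stream blocks until that event fires. Below is a minimal standalone sketch of that pattern, assuming a machine with at least two GPUs; it is not part of this patch, does not use ggml's own scheduling code, and the stream/event names are placeholders.

// Minimal sketch of cross-device event synchronization (assumption: 2+ GPUs).
#include <cuda_runtime.h>
#include <cstdio>

#define CHECK(call)                                                         \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                     \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

int main() {
    int n_devices = 0;
    CHECK(cudaGetDeviceCount(&n_devices));
    if (n_devices < 2) {
        printf("needs at least 2 devices, skipping\n");
        return 0;
    }

    cudaStream_t stream0, stream1;
    cudaEvent_t  event0;

    // device 0: create a stream and an event; timing is disabled, as in the patch above
    CHECK(cudaSetDevice(0));
    CHECK(cudaStreamCreate(&stream0));
    CHECK(cudaEventCreateWithFlags(&event0, cudaEventDisableTiming));

    // device 1: create its own stream; it may hold no data at all for a given tensor
    CHECK(cudaSetDevice(1));
    CHECK(cudaStreamCreate(&stream1));

    // device 0: enqueue work on stream0 (omitted here), then record the event behind it
    CHECK(cudaSetDevice(0));
    CHECK(cudaEventRecord(event0, stream0));

    // device 1: nothing enqueued on stream1 after this call runs until event0 has fired
    // on device 0; this is why the event must exist even when device 1 holds no rows
    // of the split tensor
    CHECK(cudaSetDevice(1));
    CHECK(cudaStreamWaitEvent(stream1, event0, 0));

    // drain both streams before cleaning up
    CHECK(cudaStreamSynchronize(stream1));
    CHECK(cudaSetDevice(0));
    CHECK(cudaStreamSynchronize(stream0));

    CHECK(cudaEventDestroy(event0));
    CHECK(cudaStreamDestroy(stream0));
    CHECK(cudaSetDevice(1));
    CHECK(cudaStreamDestroy(stream1));

    printf("cross-device event sync completed\n");
    return 0;
}

cudaStreamWaitEvent accepts events recorded on another device, which is what lets a device holding none of the split tensor's rows still be ordered correctly behind the device that does the work.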