[MLIR][XeGPU] Enhance Pack/Unpack for XeGPUUnroll #163459
@@ -246,11 +246,30 @@ xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
   if (!computeShapeRatio(srcShape, shape))
     return {value};

+  int64_t srcShapeRank = srcShape.size();
+  int64_t targetShapeRank = shape.size();
+
+  SmallVector<int64_t> adjustedTargetShape(srcShape.size());
+  int64_t rankDiff = srcShapeRank - targetShapeRank;
+  std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
+            1);
+  std::copy(shape.begin(), shape.end(), adjustedTargetShape.begin() + rankDiff);
+
+  int64_t adjustedTargetShapeRank = adjustedTargetShape.size();

Review comment: Looks like just srcShapeRank can be used.

   SmallVector<Value> result;
-  for (SmallVector<int64_t> offsets : StaticTileOffsetRange(srcShape, shape)) {
+  for (SmallVector<int64_t> offsets :
+       StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
     SmallVector<int64_t> staticStrides(offsets.size(), 1);
-    result.push_back(vector::ExtractStridedSliceOp::create(
-        builder, loc, value, offsets, shape, staticStrides));
+    Value slice = vector::ExtractStridedSliceOp::create(
+        builder, loc, value, offsets, adjustedTargetShape, staticStrides);
+
+    // Reshape to remove leading unit dims if needed
+    if (adjustedTargetShapeRank > targetShapeRank) {
+      auto targetTy = VectorType::get(shape, vecTy.getElementType());
+      slice = builder.create<vector::ShapeCastOp>(loc, targetTy, slice);
+    }
+    result.push_back(slice);
   }

   return result;
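To illustrate the extract side of this change (shapes chosen for illustration; the SSA names are made up): when the requested tile shape has lower rank than the source, the tile shape is padded with leading unit dims so the extract matches the source rank, and the unit dims are then shape-cast away. A minimal sketch of the resulting IR:

    // Extract a [16] tile from a vector<1x32xf32> source: the tile shape is
    // rank-adjusted to [1, 16] for the extract...
    %slice = vector.extract_strided_slice %src
        {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]}
        : vector<1x32xf32> to vector<1x16xf32>
    // ...then the leading unit dim is dropped to recover the requested shape.
    %tile = vector.shape_cast %slice : vector<1x16xf32> to vector<16xf32>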
@@ -274,7 +293,7 @@ Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,

   for (auto [src, offsets] :
        llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
-    SmallVector<int64_t> staticStrides(offsets.size(), 1);
+    SmallVector<int64_t> staticStrides(tileShape.size(), 1);
     result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                   offsets, staticStrides);
   }
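On the insert side, the strides are now sized to the tile rank rather than the offsets rank, which is what lets a lower-rank tile be inserted back into a higher-rank result. This mirrors the insert_strided_slice ops checked in the load_gather test below (value names here are illustrative):

    // A rank-1 tile goes into a rank-3 accumulator: offsets cover all three
    // destination dims, while strides only cover the single source dim.
    %acc = vector.insert_strided_slice %tile, %init
        {offsets = [0, 0, 16], strides = [1]}
        : vector<16xf32> into vector<1x1x32xf32>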
@@ -682,3 +682,73 @@ gpu.module @test_kernel {
     gpu.return
   }
 }
+
+// -----
+gpu.module @test_kernel {
+  // CHECK-LABEL: load_gather
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x1x32xf32>
+  // CHECK: [[cst_0:%.+]] = arith.constant dense<true> : vector<16xi1>
+  // CHECK: [[cst_1:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex>
+  // CHECK: [[cst_2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex>
+  // CHECK: [[ld_0:%.+]] = xegpu.load [[arg0]][[[cst_1]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ld_1:%.+]] = xegpu.load [[arg0]][[[cst_2]]], [[cst_0]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32>
+  // CHECK: [[ins_0:%.+]] = vector.insert_strided_slice [[ld_0]], [[cst]] {offsets = [0, 0, 0], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  // CHECK: [[ins_1:%.+]] = vector.insert_strided_slice [[ld_1]], [[ins_0]] {offsets = [0, 0, 16], strides = [1]} : vector<16xf32> into vector<1x1x32xf32>
+  gpu.func @load_gather(%src: ui64) -> vector<1x1x32xf32> {

Review comment: Is it supposed to check unit dim removal? If yes, let's reflect this in the test name.

+    %cst = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<[[
+        [0, 8, 16, 24, 32, 40, 48, 56,
+         64, 72, 80, 88, 96, 104, 112, 120,
+         128, 136, 144, 152, 160, 168, 176, 184,
+         192, 200, 208, 216, 224, 232, 240, 248]
+      ]]> : vector<1x1x32xindex>
+
+    %mask = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>} dense<true> : vector<1x1x32xi1>
+    %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout<inst_data = [1, 1, 16]>, l1_hint = #xegpu.cache_hint<cached>} : ui64, vector<1x1x32xindex>, vector<1x1x32xi1> -> vector<1x1x32xf32>

Review comment: Please check how the input is being extracted from %mask and %cst (the additional shape cast). Why not add a test with inst_data attached to %cst and %mask and see how they are unrolled all together?

+
+    gpu.return %ld : vector<1x1x32xf32>

Review comment: Please check the code sequence for how the value is composed back to 1x1x32.

+  }
+}
+
+// -----
+#l = #xegpu.layout<inst_data = [1, 16]>
+gpu.module @test_kernel {
+  // CHECK-LABEL: load_store_nd_with_offsets
+  // CHECK-SAME: [[arg0:%.+]]: memref<1024x1024xf32>, [[arg1:%.+]]: memref<1024x1024xf32>, [[arg2:%.+]]: memref<1024x1024xf32>
+  // CHECK-DAG: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<1x32xf32>
+  // CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index
+  // CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index
+  // CHECK: [[tdesc_a:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[tdesc_b:%.+]] = xegpu.create_nd_tdesc [[arg1]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[tdesc_c:%.+]] = xegpu.create_nd_tdesc [[arg2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x16xf32>
+  // CHECK: [[ld_a0:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_a1:%.+]] = xegpu.load_nd [[tdesc_a]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_b0:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c0]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[ld_b1:%.+]] = xegpu.load_nd [[tdesc_b]][[[c0]], [[c16]]] : !xegpu.tensor_desc<1x16xf32> -> vector<1x16xf32>
+  // CHECK: [[cast_a0:%.+]] = vector.shape_cast [[ld_a0]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[cast_b0:%.+]] = vector.shape_cast [[ld_b0]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[add0:%.+]] = arith.addf [[cast_a0]], [[cast_b0]] : vector<16xf32>
+  // CHECK: [[ins0:%.+]] = vector.insert_strided_slice [[add0]], [[cst]] {offsets = [0, 0], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+  // CHECK: [[cast_a1:%.+]] = vector.shape_cast [[ld_a1]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[cast_b1:%.+]] = vector.shape_cast [[ld_b1]] : vector<1x16xf32> to vector<16xf32>
+  // CHECK: [[add1:%.+]] = arith.addf [[cast_a1]], [[cast_b1]] : vector<16xf32>
+  // CHECK: [[ins1:%.+]] = vector.insert_strided_slice [[add1]], [[ins0]] {offsets = [0, 16], strides = [1]} : vector<16xf32> into vector<1x32xf32>
+  // CHECK: [[ext0:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 0], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+  // CHECK: [[ext1:%.+]] = vector.extract_strided_slice [[ins1]] {offsets = [0, 16], sizes = [1, 16], strides = [1, 1]} : vector<1x32xf32> to vector<1x16xf32>
+  // CHECK: xegpu.store_nd [[ext0]], [[tdesc_c]][[[c0]], [[c0]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+  // CHECK: xegpu.store_nd [[ext1]], [[tdesc_c]][[[c0]], [[c16]]] : vector<1x16xf32>, !xegpu.tensor_desc<1x16xf32>
+  gpu.func @load_store_nd_with_offsets(%A: memref<1024x1024xf32>, %B: memref<1024x1024xf32>, %C: memref<1024x1024xf32>) {

Review comment: Please also mention in the description that we add a test for the new nd_load/store ops syntax.

+    %c0 = arith.constant 0 : index
+
+    %a_tdesc = xegpu.create_nd_tdesc %A : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+    %b_tdesc = xegpu.create_nd_tdesc %B : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+    %c_tdesc = xegpu.create_nd_tdesc %C : memref<1024x1024xf32> -> !xegpu.tensor_desc<1x32xf32, #l>
+
+    %a = xegpu.load_nd %a_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+    %b = xegpu.load_nd %b_tdesc[%c0, %c0] : !xegpu.tensor_desc<1x32xf32, #l> -> vector<1x32xf32>
+
+    %result = arith.addf %a, %b {layout_result_0 = #l} : vector<1x32xf32>
+    xegpu.store_nd %result, %c_tdesc[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #l>
+    gpu.return
+  }
+}
Review comment: Can you please elaborate a bit regarding unit dimensions, perhaps with a short example?
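A minimal sketch of what the unit-dimension handling amounts to, using the shapes from the load_gather test above (value names are illustrative): a vector<1x1x32xf32> value with inst_data = [1, 1, 16] is unrolled into rank-1 vector<16xf32> tiles and later composed back:

    // Unpack: extract the [1, 1, 16] tile at the source rank, then drop the
    // leading unit dims so the unrolled op works on a plain vector<16xf32>.
    %s = vector.extract_strided_slice %v
        {offsets = [0, 0, 16], sizes = [1, 1, 16], strides = [1, 1, 1]}
        : vector<1x1x32xf32> to vector<1x1x16xf32>
    %t = vector.shape_cast %s : vector<1x1x16xf32> to vector<16xf32>
    // Pack: insert the rank-1 tile back; the strides are sized to the tile
    // rank, so no shape_cast is needed on this side.
    %r = vector.insert_strided_slice %t, %acc
        {offsets = [0, 0, 16], strides = [1]}
        : vector<16xf32> into vector<1x1x32xf32>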