@@ -80,9 +80,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
80
80
information e.g., memref<?x?xf16>, the strides information has to be explicitly
81
81
passed via the "strides" and "const_strides" arguments.
82
82
83
- In SIMT mode, tensor descriptor is augmented with `LayoutAttr` which describes the
84
- mapping of the tensor descriptor to the work items.
85
-
86
83
Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
87
84
```mlir
88
85
%0 = memref.alloc() : memref<1024x1024xf32>
@@ -106,15 +103,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
106
103
%c1 = arith.constant 1 : index
107
104
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
108
105
```
109
-
110
- Example 4 (SIMT mode):
111
- ```mlir
112
- %0 = memref.alloc() : memref<1024x1024xf32>
113
- %c0 = arith.constant 0 : index
114
- %c1 = arith.constant 8 : index
115
- %1 = xegpu.create_nd_tdesc %0[%c0, %c0] : memref<1024x1024xf32>
116
- -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
117
- ```
118
106
}];
119
107
120
108
let arguments = (ins
@@ -301,9 +289,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
301
289
fp32 or fp64. It implies that vnni and transpose cannot exist at the
302
290
same time.
303
291
304
- In SIMT mode, LoadNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
305
- which describes the mapping of the tensor to the work items. In this case, result
306
- vector represents the data to be loaded by each work-item.
292
+ In SIMT mode, result vector represents the data to be loaded by each work-item.
307
293
308
294
Example 1:
309
295
```mlir
@@ -317,8 +303,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
317
303
```mlir
318
304
xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
319
305
l2_hint = #xegpu.cache_hint<uncached>}>
320
- : !xegpu.tensor_desc<8x16xf32,
321
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x1xf32>
306
+ : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
322
307
```
323
308
324
309
@@ -359,9 +344,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
359
344
of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
360
345
the corresponding cache hint attribute will be masked.
361
346
362
- In SIMT mode, StoreNdOp expects the tensor descriptor to be augmented with `LayoutAttr`
363
- which describes the mapping of the tensor to the work items. In this case, input
364
- vector represents the data to be stored by each work-item.
347
+ In SIMT mode, the input vector represents the data to be stored by each work-item.
365
348
366
349
Example 1:
367
350
```mlir
@@ -375,8 +358,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
375
358
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
376
359
l2_hint = #xegpu.cache_hint<write_back>,
377
360
l3_hint = #xegpu.cache_hint<write_through>}
378
- : vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16,
379
- #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
361
+ : vector<8xf16>, !xegpu.tensor_desc<8x16xf16>
380
362
```
381
363
382
364
@@ -410,15 +392,10 @@ def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
410
392
The offsets are relative offsets to the current position, given in number
411
393
of elements. The result is a TensorDesc of the same type as the input.
412
394
413
- Example 1 :
395
+ Example:
414
396
```
415
397
%2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
416
398
```
417
- Example 2 (SIMT mode):
418
- ```
419
- %2 = xegpu.update_nd_offset %1, [0, 16]:
420
- !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
421
- ```
422
399
}];
423
400
424
401
let arguments = (ins
@@ -476,11 +453,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
476
453
match the dimension of offsets. It may also have a second dimension corresponding to
477
454
the chunk_size if the chunk size is larger than 1.
478
455
479
- In SIMT mode, similar to `create_nd_tdesc` the resulting tensor descriptor is augmented
480
- with `LayoutAttr` which describes the mapping of the tensor descriptor to the work items.
481
- In this case, the first dimension of the tensor descriptor represents the work-items, and
482
- the second dimension represents the chunk size.
483
-
484
456
Example 1: It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
485
457
```mlir
486
458
%a = memref.alloc() : memref<1024xf32>
@@ -505,15 +477,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
505
477
%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
506
478
-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
507
479
```
508
-
509
- Example 4: SIMT mode
510
- ```mlir
511
- %0 = memref.alloc() : memref<1024xf32>
512
- %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
513
- %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
514
- -> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>,
515
- #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
516
- ```
517
480
}];
518
481
519
482
let arguments = (ins XeGPU_BaseAddrType: $source,
@@ -609,54 +572,44 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
609
572
let description = [{ It (aka. load) loads data for each work-item. The output
610
573
describes the data being loaded at the subgroup level, so its size is
611
574
consistent with the number of work-items in a subgroup. When the chunk size
612
- is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
613
- to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
614
- Specially, there is a transpose effect on the result (as compared to the TensorDesc)
615
- due to the hardware implementation. Therefore, a transpose attribute is introduced
616
- on purpose, making sure users are aware of this implicit transformation.
617
-
575
+ is larger than 1, the output vector is a 2D vector, with dim-0 corresponding
576
+ to work-items, and dim-1 corresponding to the chunk size loaded by each work-item.
618
577
The mask operand masks out memory access so that it is safe to pass out-of-boundary
619
578
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
620
579
621
- In SIMT mode, LoadGatherOp expects the tensor descriptor to be augmented with `LayoutAttr`
622
- which describes the mapping of the tensor to the work items. In this case, result vector
623
- represents the data to be loaded by each work-item. Each work-item recieves a `chunk_size`
624
- number of elements.
580
+ In SIMT mode, the result vector represents the data to be loaded by each work-item.
581
+ Each work-item receives a `chunk_size` number of elements.
625
582
626
583
Example 1:
627
584
```mlir
628
- %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
629
- l2_hint = #xegpu.cache_hint<uncached>,
630
- l3_hint = #xegpu.cache_hint<uncached>}
585
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
586
+ l2_hint = #xegpu.cache_hint<uncached>,
587
+ l3_hint = #xegpu.cache_hint<uncached>}>
631
588
: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
632
589
vector<16xi1> -> vector<16xf32>
633
590
```
634
591
635
592
Example 2:
636
593
```mlir
637
- %2 = xegpu.load %1, %0 {transpose,
638
- l1_hint = #xegpu.cache_hint<cached>,
639
- l2_hint = #xegpu.cache_hint<uncached>,
640
- l3_hint = #xegpu.cache_hint<uncached>}
594
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
595
+ l2_hint = #xegpu.cache_hint<uncached>,
596
+ l3_hint = #xegpu.cache_hint<uncached>}>
641
597
: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
642
- vector<16xi1> -> vector<8x16xf32 >
598
+ vector<16xi1> -> vector<16x8xf32>
643
599
```
644
600
Example 3 (SIMT mode):
645
601
```mlir
646
- %2 = xegpu.load %1, %0 {transpose,
647
- l1_hint = #xegpu.cache_hint<cached>,
648
- l2_hint = #xegpu.cache_hint<uncached>,
649
- l3_hint = #xegpu.cache_hint<uncached>}
650
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>,
651
- !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>>
652
- vector<16xi1> -> vector<8x1xf32>
602
+ %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint<cached>,
603
+ l2_hint = #xegpu.cache_hint<uncached>,
604
+ l3_hint = #xegpu.cache_hint<uncached>}>
605
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
606
+ vector<16xi1> -> vector<8xf32>
653
607
```
654
608
655
609
}];
656
610
657
611
let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
658
612
XeGPU_MaskType: $mask,
659
- OptionalAttr<UnitAttr>: $transpose,
660
613
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
661
614
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
662
615
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -699,44 +652,38 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [
699
652
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
700
653
introduced on purpose, making sure users are aware of this implicit transformation.
701
654
702
- In SIMT mode, StoreScatterOp expects the tensor descriptor to be augmented with `LayoutAttr`
703
- which describes the mapping of the tensor to the work items. In this case, input vector
704
- represents the data to be stored by each work-item. Each work-item recieves a `chunk_size`
705
- number of elements.
655
+ In SIMT mode, the input vector represents the data to be stored by each work-item.
656
+ Each work-item stores a `chunk_size` number of elements.
706
657
707
658
Example 1:
708
659
```mlir
709
- xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
710
- l2_hint = #xegpu.cache_hint<write_back>,
711
- l3_hint = #xegpu.cache_hint<write_through>}
660
+ xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
661
+ l2_hint = #xegpu.cache_hint<write_back>,
662
+ l3_hint = #xegpu.cache_hint<write_through>}>
712
663
: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
713
664
```
714
665
715
666
Example 2:
716
667
```mlir
717
- xegpu.store %0, %1, %2 {transpose,
718
- l1_hint = #xegpu.cache_hint<uncached>,
719
- l2_hint = #xegpu.cache_hint<write_back>,
720
- l3_hint = #xegpu.cache_hint<write_through>}
721
- : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
668
+ xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
669
+ l2_hint = #xegpu.cache_hint<write_back>,
670
+ l3_hint = #xegpu.cache_hint<write_through>}>
671
+ : vector<16x8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
722
672
```
673
+
723
674
Example 3 (SIMT mode):
724
675
```mlir
725
- xegpu.store %0, %1, %2 {transpose,
726
- l1_hint = #xegpu.cache_hint<uncached>,
727
- l2_hint = #xegpu.cache_hint<write_back>,
728
- l3_hint = #xegpu.cache_hint<write_through>}
729
- : vector<8x1xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>,
730
- !xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> vector<16xi1>
676
+ xegpu.store %0, %1, %2 <{l1_hint = #xegpu.cache_hint<uncached>,
677
+ l2_hint = #xegpu.cache_hint<write_back>,
678
+ l3_hint = #xegpu.cache_hint<write_through>}>
679
+ : vector<8xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
731
680
```
732
-
733
681
}];
734
682
735
683
let arguments = (ins
736
684
XeGPU_ValueType: $value,
737
685
XeGPU_TensorDesc: $TensorDesc,
738
686
XeGPU_MaskType: $mask,
739
- OptionalAttr<UnitAttr>: $transpose,
740
687
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
741
688
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
742
689
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -773,20 +720,13 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
773
720
update the offset per work-item, so its offsets contain values representing
774
721
shifts for each work-item.
775
722
776
- Example 1 :
723
+ Example:
777
724
```mlir
778
725
%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
779
726
%2 = xegpu.update_offset %1, %off :
780
727
!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>>, vector<4xindex>
781
728
```
782
729
783
- Example 2 (SIMT mode):
784
- ```mlir
785
- %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
786
- %2 = xegpu.update_offset %1, %off :
787
- !xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<chunk_size=2>,
788
- #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
789
- ```
790
730
}];
791
731
792
732
let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
0 commit comments