diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index cbef637be213a..123757bfba7b0 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -257,7 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*v*3IUiIiIUi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// // Deep learning builtins. diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b56b739094ff3..92e593e6f8648 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -574,6 +574,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy}); return Builder.CreateCall(F, {Addr}); } + case AMDGPU::BI__builtin_amdgcn_global_load_lds: { + return emitBuiltinWithOneOverloadedOperand<5>( + *this, E, Intrinsic::amdgcn_global_load_lds); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 8251d6c213e3d..35f2eae1c1202 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -1774,7 +1774,7 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed) // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) +// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0) // CHECK-NEXT: ret void // void test_global_load_lds_96(global void* src, local void *dst) { @@ -1789,7 +1789,7 @@ void test_global_load_lds_96(global void* src, local void *dst) { // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) +// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0) // CHECK-NEXT: ret void // void test_global_load_lds_128(global void* src, local void *dst) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl index 62c8deb6e4a89..5ff544f19de1b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-lds.cl @@ -18,7 +18,7 @@ typedef unsigned char u8; // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) +// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0) // CHECK-NEXT: ret void // void test_global_load_lds_u32(global u32* src, local u32 *dst) { @@ -35,7 +35,7 @@ void test_global_load_lds_u32(global u32* src, local u32 *dst) { // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) +// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0) // CHECK-NEXT: ret void // void test_global_load_lds_u16(global u16* src, local u16 *dst) { @@ -52,7 +52,7 @@ void test_global_load_lds_u16(global u16* src, local u16 *dst) { // CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4 -// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) +// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0) // CHECK-NEXT: ret void // void test_global_load_lds_u8(global u8* src, local u8 *dst) { diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 217e43fcce4fd..fc6dac5dc99fc 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2624,17 +2624,20 @@ def int_amdgcn_perm : // GFX9 Intrinsics //===----------------------------------------------------------------------===// +// Intrinsic for loading data from a global-memory pointer to LDS +// Also supports buffer fat pointers. class AMDGPUGlobalLoadLDS : ClangBuiltin<"__builtin_amdgcn_global_load_lds">, Intrinsic < [], - [LLVMQualPointerType<1>, // Base global pointer to load from - LLVMQualPointerType<3>, // LDS base pointer to store to + [llvm_anyptr_ty, // Global or buffer fat pointer to load from (per-lane) + LLVMQualPointerType<3>, // LDS base pointer to store to (uniform) llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // imm offset (applied to both global and LDS address) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, // bit 1 = sc1, // bit 4 = scc)) + // See raw_ptr_buffer_load_lds for semantics on ptr addrspace(7) [IntrWillReturn, NoCapture>, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 766a4ea250942..f8b3c122d75ab 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset: case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: + case Intrinsic::amdgcn_global_load_lds: return true; } } @@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } + case Intrinsic::amdgcn_global_load_lds: { + Value *Ptr = I.getArgOperand(0); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *LDSPtr = I.getArgOperand(1); + Value *LoadSize = I.getArgOperand(2); + Value *ImmOff = I.getArgOperand(3); + Value *Aux = I.getArgOperand(4); + Value *SOffset = IRB.getInt32(0); + Instruction *NewLoad = IRB.CreateIntrinsic( + Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, + {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + copyMetadata(NewLoad, &I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewLoad); + return {nullptr, nullptr}; + } } return {nullptr, nullptr}; } @@ -2291,7 +2311,10 @@ class AMDGPULowerBufferFatPointers : public ModulePass { public: static char ID; - AMDGPULowerBufferFatPointers() : ModulePass(ID) {} + AMDGPULowerBufferFatPointers() : ModulePass(ID) { + initializeAMDGPULowerBufferFatPointersPass( + *PassRegistry::getPassRegistry()); + } bool run(Module &M, const TargetMachine &TM); bool runOnModule(Module &M) override; diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index ee51b0b84554e..75175955b313f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false) ret void } + +;;; Buffer load to LDS + +declare void @llvm.amdgcn.global.load.lds.p7(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg) + +define void @llvm_amdgcn_global_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {; +; CHECK-LABEL: define void @llvm_amdgcn_global_load_lds( +; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 +; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0) +; CHECK-NEXT: ret void +; + %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx + call void @llvm.amdgcn.global.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0) + ret void +} diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 900155c274b4d..6c2e6c7a6aac5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -447,14 +447,17 @@ def ROCDL_ds_read_tr16_b64 : ROCDL_LDS_Read_Tr_IntrOp<"ds.read.tr16.b64">; // Global load to LDS intrinsic (available in GFX950) def ROCDL_GlobalLoadLDSOp : - ROCDL_IntrOp<"global.load.lds", [], [], [], 0, 0, 1> { - dag args = (ins Arg:$globalPtr, + ROCDL_IntrOp<"global.load.lds", [], [0], [], 0, 0, 1, [2, 3, 4], ["size", "offset", "aux"]> { + dag args = (ins Arg:$globalPtr, Arg:$ldsPtr, - I32:$size, - I32:$offset, - I32:$aux); + I32Attr:$size, + I32Attr:$offset, + I32Attr:$aux); let arguments = !con(args, aliasAttrs); - let assemblyFormat = "operands attr-dict"; + let assemblyFormat = [{ + $globalPtr `,` $ldsPtr `,` $size `,` $offset `,` $aux + attr-dict `:` type($globalPtr) + }]; let extraClassDefinition = [{ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { return {getGlobalPtr(), getLdsPtr()}; diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 5f697bdeef566..a344e3dfcc131 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -1050,9 +1050,9 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern { (adaptor.getDstIndices()), rewriter); rewriter.replaceOpWithNewOp( - op, srcPtr, dstPtr, createI32Constant(rewriter, loc, loadWidth), - createI32Constant(rewriter, loc, 0), - createI32Constant(rewriter, loc, 0), ArrayAttr{}, ArrayAttr{}, + op, srcPtr, dstPtr, rewriter.getI32IntegerAttr(loadWidth), + /*offset=*/rewriter.getI32IntegerAttr(0), + /*aux=*/rewriter.getI32IntegerAttr(0), ArrayAttr{}, ArrayAttr{}, ArrayAttr{}); return success(); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir index b1c16bd5db079..326688c0801d6 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir @@ -21,8 +21,8 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast - // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] - + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64 @@ -35,8 +35,7 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] - // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] : f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> func.return @@ -56,8 +55,8 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] - // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] - + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64 @@ -70,8 +69,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] - // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]] + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 1 %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index @@ -85,7 +83,7 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs // CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>) func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) { // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] - + // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 // CHECK: %[[C12:.*]] = arith.constant 12 : index @@ -95,8 +93,8 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] - // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] - + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64 // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64 // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64 @@ -109,8 +107,7 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] - // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 %c0 = arith.constant 0 : index %c12 = arith.constant 12 : index %c32 = arith.constant 32 : index @@ -129,12 +126,11 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] - // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]] // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]] - // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> %c0 = arith.constant 0 : index amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0] diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 828fd58544597..58276fc10147e 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -637,12 +637,8 @@ llvm.func @rocdl.ds.read.tr(%ptr : !llvm.ptr<3>) -> vector<4xf16> { } llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { - %aux = llvm.mlir.constant(0 : i32) : i32 - %offset = llvm.mlir.constant(0 : i32) : i32 - %size = llvm.mlir.constant(10 : i32) : i32 - - //CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} - rocdl.global.load.lds %src, %dst, %size, %offset, %aux + //CHECK: rocdl.global.load.lds %{{.*}}, %{{.*}}, 4, 0, 0 : <1> + rocdl.global.load.lds %src, %dst, 4, 0, 0 : <1> llvm.return } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index e70617bfff99e..0e46edcae5784 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -842,11 +842,8 @@ llvm.func @rocdl.ds.read.tr(%ptr : !llvm.ptr<3>) -> vector<4xf16> { } llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { - %aux = llvm.mlir.constant(0 : i32) : i32 - %offset = llvm.mlir.constant(0 : i32) : i32 - %size = llvm.mlir.constant(10 : i32) : i32 - //CHECK: call void @llvm.amdgcn.global.load.lds - rocdl.global.load.lds %src, %dst, %size, %offset, %aux + //CHECK: call void @llvm.amdgcn.global.load.lds.p1 + rocdl.global.load.lds %src, %dst, 4, 0, 0 : !llvm.ptr<1> llvm.return }