From 244e3c906f2c57a1a3dd3c8fe0459e1bd045b233 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 15:32:25 +0000 Subject: [PATCH 1/2] [memcpyopt] allow some undef contents overread in processMemCpyMemCpyDependence --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 53 +++++++++++++------ .../MemCpyOpt/memcpy-memcpy-offset.ll | 31 ++++++++--- .../MemCpyOpt/variable-sized-memcpy-memcpy.ll | 37 ++++++++++++- 3 files changed, 97 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1c4ec6aa08b43..9ab8d645e5911 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -107,6 +107,9 @@ struct MemsetRange { } // end anonymous namespace +static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy, + MemIntrinsic *MemSrc, BatchAAResults &BAA); + bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End - Start >= 16) @@ -1129,14 +1132,28 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, MForwardOffset = *Offset; } - // The length of the memcpy's must be the same, or the preceding one - // must be larger than the following one. - if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) { + Value *CopyLength = M->getLength(); + + // The length of the memcpy's must be the same, or the preceding one must be + // larger than the following one, or the contents of the overread must be + // undefined bytes of a defined size. + if (MForwardOffset != 0 || MDep->getLength() != CopyLength) { auto *MDepLen = dyn_cast(MDep->getLength()); - auto *MLen = dyn_cast(M->getLength()); - if (!MDepLen || !MLen || - MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) - return false; + auto *MLen = dyn_cast(CopyLength); + if (!MDepLen || !MLen) + return false; // This could be converted to a runtime test (%CopyLength = + // min(max(0, MDepLen - MForwardOffset), MLen)), but it is + // unclear if that is useful + if (MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) { + if (!overreadUndefContents(MSSA, M, MDep, BAA)) + return false; + if (MDepLen->getZExtValue() <= (uint64_t)MForwardOffset) + return false; // Should not reach here (there is obviously no aliasing + // with MDep), so just bail in case it had incomplete info + // somehow + CopyLength = ConstantInt::get(CopyLength->getType(), + MDepLen->getZExtValue() - MForwardOffset); + } } IRBuilder<> Builder(M); @@ -1152,9 +1169,13 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, eraseInstruction(NewCopySource); }); MaybeAlign CopySourceAlign = MDep->getSourceAlign(); - // We just need to calculate the actual size of the copy. - auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize( - MemoryLocation::getForSource(M).Size); + auto MCopyLoc = MemoryLocation::getForSource(MDep); + // Truncate the size of the MDep access to just the bytes read + if (MDep->getLength() != CopyLength) { + auto ConstLength = cast(CopyLength); + MCopyLoc = MCopyLoc.getWithNewSize( + LocationSize::precise(ConstLength->getZExtValue())); + } // When the forwarding offset is greater than 0, we transform // memcpy(d1 <- s1) @@ -1223,20 +1244,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // example we could be moving from movaps -> movq on x86. Instruction *NewM; if (UseMemMove) - NewM = - Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource, - CopySourceAlign, M->getLength(), M->isVolatile()); + NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource, + CopySourceAlign, CopyLength, M->isVolatile()); else if (M->isForceInlined()) // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is // never allowed since that would allow the latter to be lowered as a call // to an external function. NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), - CopySource, CopySourceAlign, - M->getLength(), M->isVolatile()); + CopySource, CopySourceAlign, CopyLength, + M->isVolatile()); else NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource, - CopySourceAlign, M->getLength(), - M->isVolatile()); + CopySourceAlign, CopyLength, M->isVolatile()); NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID); diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index da654438d7bd6..c5406c5c64340 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -135,13 +135,14 @@ define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) { } ; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on. -define void @do_not_forward_oversize_offset(ptr %src, ptr %dest) { -; CHECK-LABEL: define void @do_not_forward_oversize_offset( +define void @forward_oversize_offset(ptr %src, ptr %dest) { +; CHECK-LABEL: define void @forward_oversize_offset( ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { -; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 6, i1 false) -; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 6, i1 false) +; CHECK-NEXT: [[CPY_TMP:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[CPY_TMP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: [[CPY_TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[CPY_TMP]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false) ; CHECK-NEXT: ret void ; %cpy_tmp = alloca %buf, align 1 @@ -214,6 +215,24 @@ define void @pr98675(ptr noalias %p1, ptr noalias %p2) { ret void } +define void @over_offset_cpy(ptr %src) { +; CHECK-LABEL: define void @over_offset_cpy( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 2, align 1 +; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 1, align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC]], i64 1, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 +; CHECK-NEXT: ret void +; + %tmp = alloca i8, i64 2 + %dst = alloca i8, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false) + %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 1, i1 false) + + ret void +} + declare void @use(ptr) declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll index 4f6b734ec057d..95402a8ea686c 100644 --- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll @@ -18,7 +18,42 @@ define void @test(ptr %src, i64 %size) { ret void } -; Differing sizes, so left as it is. +define void @dynalloca_test(ptr %src, i64 %size1) { +; CHECK-LABEL: @dynalloca_test( +; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 +; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 8 [[SRC]], i64 31, i1 false) +; CHECK-NEXT: ret void +; + %tmp = alloca i8, i64 %size1 + %dst = alloca i8, i64 %size1 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 32, i1 false) + + ret void +} + +define void @dynalloca_offset_test(ptr %src, i64 %size1) { +; CHECK-LABEL: @dynalloca_offset_test( +; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 +; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 1 [[TMP1]], i64 30, i1 false) +; CHECK-NEXT: ret void +; + %tmp = alloca i8, i64 %size1 + %dst = alloca i8, i64 %size1 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false) + %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 31, i1 false) + + ret void +} + +; Dynamic sizes, so left as it is. define void @negative_test(ptr %src, i64 %size1, i64 %size2) { ; CHECK-LABEL: @negative_test( ; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 From 72002f67782858e627e7917472ada55c7a31f292 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 12 Jun 2025 13:39:37 +0000 Subject: [PATCH 2/2] review --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 9 +++++---- llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9ab8d645e5911..2b0e221f341e0 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1140,10 +1140,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, if (MForwardOffset != 0 || MDep->getLength() != CopyLength) { auto *MDepLen = dyn_cast(MDep->getLength()); auto *MLen = dyn_cast(CopyLength); + // This could be converted to a runtime test (%CopyLength = + // min(max(0, MDepLen - MForwardOffset), MLen)), but it is + // unclear if that is useful if (!MDepLen || !MLen) - return false; // This could be converted to a runtime test (%CopyLength = - // min(max(0, MDepLen - MForwardOffset), MLen)), but it is - // unclear if that is useful + return false; if (MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) { if (!overreadUndefContents(MSSA, M, MDep, BAA)) return false; @@ -1172,7 +1173,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, auto MCopyLoc = MemoryLocation::getForSource(MDep); // Truncate the size of the MDep access to just the bytes read if (MDep->getLength() != CopyLength) { - auto ConstLength = cast(CopyLength); + auto *ConstLength = cast(CopyLength); MCopyLoc = MCopyLoc.getWithNewSize( LocationSize::precise(ConstLength->getZExtValue())); } diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index c5406c5c64340..7dc579aad02f2 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -134,7 +134,7 @@ define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) { ret void } -; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on. +; We can forward `memcpy` by shrinking it to the size of the `memcpy` it depends on. define void @forward_oversize_offset(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_oversize_offset( ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { @@ -218,14 +218,14 @@ define void @pr98675(ptr noalias %p1, ptr noalias %p2) { define void @over_offset_cpy(ptr %src) { ; CHECK-LABEL: define void @over_offset_cpy( ; CHECK-SAME: ptr [[SRC:%.*]]) { -; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 2, align 1 -; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 1, align 1 +; CHECK-NEXT: [[TMP:%.*]] = alloca [2 x i8], align 1 +; CHECK-NEXT: [[DST:%.*]] = alloca i8, align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC]], i64 1, i1 false) ; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 ; CHECK-NEXT: ret void ; - %tmp = alloca i8, i64 2 - %dst = alloca i8, i64 1 + %tmp = alloca [2 x i8] + %dst = alloca i8 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false) %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 1, i1 false)