struct Foo { int o[200]; };
struct Bar { char p; Foo f; };

__attribute__((noinline)) Foo moo() { return {0}; }

void goo(Bar *f) { f->f = moo(); }

Compiled with -O3 -fno-exceptions -fomit-frame-pointer, clang gives:

moo():                                  # @moo()
        push    rbx
        mov     rbx, rdi
        xor     esi, esi
        mov     edx, 800
        call    memset
        mov     rax, rbx
        pop     rbx
        ret
goo(Bar*):                              # @goo(Bar*)
        push    r14
        push    rbx
        sub     rsp, 808
        mov     rbx, rdi
        lea     r14, [rsp + 8]
        mov     rdi, r14
        call    moo()
        add     rbx, 4
        mov     edx, 800
        mov     rdi, rbx
        mov     rsi, r14
        call    memcpy
        add     rsp, 808
        pop     rbx
        pop     r14
        ret

versus GCC's:

moo():
        mov     rdx, rdi
        mov     QWORD PTR [rdi], 0
        mov     QWORD PTR [rdi+792], 0
        lea     rdi, [rdi+8]
        mov     rcx, rdx
        xor     eax, eax
        and     rdi, -8
        sub     rcx, rdi
        add     ecx, 800
        shr     ecx, 3
        rep stosq
        mov     rax, rdx
        ret
goo(Bar*):
        add     rdi, 4
        call    moo()
        rep ret

Notice the elided memcpy in GCC's goo().
Dan or Reid, this is the next bug in my "big structs are hard" series. Any ideas on what's going wrong here?
GCC's lowering would be non-conforming if applied in general. Consider what happens in this case: struct Foo { int o[200]; }; struct Bar { char p; Foo f; }; Bar bar; __attribute__((noinline)) Foo moo() { return {1, bar.f.o[0]}; } void goo(Bar *f) { f->f = moo(); } int main() { goo(&bar); return bar.f.o[1]; } Here, under a correct compiler, main() must return 0. But if the optimization is applied to 'goo' in this case, then main() might instead return 1. In order for the optimization to be safe, you need to prove that the destination of the assignment is not visible in the callee. GCC apparently *does* do this; the code generated for the 'goo' function in the above code includes an 800-byte copy, presumably because 'moo' accesses a global that potentially aliases 'f->f' (in fact, GCC seems to be being even more conservative: it disables the optimization if 'moo' accesses *any* global storage that might be visible to 'goo').
One thing I notice is that GCC needs to see the definition of 'moo' in this example, which probably limits the applicability of this optimization. This optimization requires proving that 'moo' doesn't read f->f. LLVM only infers these function attributes, which are not enough: ; Function Attrs: noinline nounwind uwtable define void @moo(%struct.Foo* noalias nocapture sret %agg.result) local_unnamed_addr #0 { If functionattrs could infer argmemonly or writeonly, then maybe memcpyopt could leverage that aliasing information to do this transform.
(In reply to Richard Smith from comment #2) > GCC's lowering would be non-conforming if applied in general. Consider what > happens in this case: My version of GCC doesn't appear to elide the memcpy in this case. It's an optimization that requires some inter-procedural aliasing info.
(In reply to Reid Kleckner from comment #4) > My version of GCC doesn't appear to elide the memcpy in this case. It's an > optimization that requires some inter-procedural aliasing info. Right, I intended to convey that information in the bottom half of my reply; sorry if that wasn't clear.
So I originally reduced this test case from some Rust code which has better aliasing information. The following code will have its memcpy eliminated by opt -O2: ; ModuleID = 'large_return0-8787f43e282added376259c1adb08b80.rs' source_filename = "large_return0-8787f43e282added376259c1adb08b80.rs" target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin" %Bar = type { [200 x i8], [0 x i8] } ; large_return::moo ; Function Attrs: noinline nounwind define void @_ZN12large_return3moo17h8a52c64dcae995e9E([200 x i8]* noalias nocapture sret dereferenceable(200)) unnamed_addr #0 { start: %1 = getelementptr inbounds [200 x i8], [200 x i8]* %0, i32 0, i32 0 call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 200, i32 1, i1 false) ret void } ; large_return::goo ; Function Attrs: nounwind define void @_ZN12large_return3goo17hdc16192f05aca994E(%Bar* noalias dereferenceable(200) %f) unnamed_addr #1 { start: %_2 = alloca [200 x i8] ; call large_return::moo call void @_ZN12large_return3moo17h8a52c64dcae995e9E([200 x i8]* noalias nocapture sret dereferenceable(200) %_2) br label %bb1 bb1: ; preds = %start %0 = getelementptr inbounds %Bar, %Bar* %f, i32 0, i32 0 %1 = bitcast [200 x i8]* %_2 to i8* %2 = bitcast [200 x i8]* %0 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %1, i64 200, i32 1, i1 false) ret void } ; Function Attrs: argmemonly nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2 ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2 attributes #0 = { noinline nounwind "no-frame-pointer-elim"="true" "probe-stack"="__rust_probestack" } attributes #1 = { nounwind "no-frame-pointer-elim"="true" "probe-stack"="__rust_probestack" } but this code will not: ; ModuleID = 'large_return0-8787f43e282added376259c1adb08b80.rs' source_filename = "large_return0-8787f43e282added376259c1adb08b80.rs" target datalayout 
= "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin" %Bar = type { i8, [0 x i8], [200 x i8], [0 x i8] } ; large_return::moo ; Function Attrs: noinline nounwind define void @_ZN12large_return3moo17h8a52c64dcae995e9E([200 x i8]* noalias nocapture sret dereferenceable(200)) unnamed_addr #0 { start: %1 = getelementptr inbounds [200 x i8], [200 x i8]* %0, i32 0, i32 0 call void @llvm.memset.p0i8.i64(i8* %1, i8 0, i64 200, i32 1, i1 false) ret void } ; large_return::goo ; Function Attrs: nounwind define void @_ZN12large_return3goo17hdc16192f05aca994E(%Bar* noalias dereferenceable(201) %f) unnamed_addr #1 { start: %_2 = alloca [200 x i8] ; call large_return::moo call void @_ZN12large_return3moo17h8a52c64dcae995e9E([200 x i8]* noalias nocapture sret dereferenceable(200) %_2) br label %bb1 bb1: ; preds = %start %0 = getelementptr inbounds %Bar, %Bar* %f, i32 0, i32 2 %1 = bitcast [200 x i8]* %_2 to i8* %2 = bitcast [200 x i8]* %0 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %1, i64 200, i32 1, i1 false) ret void } ; Function Attrs: argmemonly nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2 ; Function Attrs: argmemonly nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2 attributes #0 = { noinline nounwind "no-frame-pointer-elim"="true" "probe-stack"="__rust_probestack" } attributes #1 = { nounwind "no-frame-pointer-elim"="true" "probe-stack"="__rust_probestack" } attributes #2 = { argmemonly nounwind } the difference is: --- /tmp/large-return-works.ll 2017-11-01 15:46:08.940497981 -0400 +++ /tmp/large-return.ll 2017-11-01 15:50:41.471564461 -0400 @@ -3,7 +3,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin" -%Bar = type { [200 x i8], [0 x i8] } +%Bar = type { i8, [0 x i8], [200 x i8], [0 x i8] } ; large_return::moo ; Function Attrs: noinline nounwind @@ -16,7 +16,7 @@ ; 
large_return::goo ; Function Attrs: nounwind -define void @_ZN12large_return3goo17hdc16192f05aca994E(%Bar* noalias dereferenceable(200) %f) unnamed_addr #1 { +define void @_ZN12large_return3goo17hdc16192f05aca994E(%Bar* noalias dereferenceable(201) %f) unnamed_addr #1 { start: %_2 = alloca [200 x i8] ; call large_return::moo @@ -24,7 +24,7 @@ br label %bb1 bb1: ; preds = %start - %0 = getelementptr inbounds %Bar, %Bar* %f, i32 0, i32 0 + %0 = getelementptr inbounds %Bar, %Bar* %f, i32 0, i32 2 %1 = bitcast [200 x i8]* %_2 to i8* %2 = bitcast [200 x i8]* %0 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %1, i64 200, i32 1, i1 false)
I have a patch related to this at https://reviews.llvm.org/D40723