Why does C++ code for testing the Collatz conjecture run faster than hand-written assembly?
Asked 7 years, 2 months ago
Modified 2 months ago
Viewed 180k times
I wrote these two solutions for Project Euler Q14, in assembly and in C++. They implement an identical brute-force approach for testing the Collatz conjecture. The assembly solution was assembled with NASM and linked with gcc.
Assembly, p14.asm:
section .data
    fmt db "%d", 10, 0

global main
extern printf

section .text

main:
    mov rcx, 1000000
    xor rdi, rdi        ; max i
    xor rsi, rsi        ; i

l1:
    dec rcx
    xor r10, r10        ; count
    mov rax, rcx

l2:
    test rax, 1
    jpe even
    mov rbx, 3
    mul rbx
    inc rax
    jmp c1

even:
    mov rbx, 2
    xor rdx, rdx
    div rbx

c1:
    inc r10
    cmp rax, 1
    jne l2
    cmp rcx, 2
    jne l1
C++, p14.cpp:
#include <iostream>

int sequence(long n) {
    int count = 1;
    while (n != 1) {
        if (n % 2 == 0)
            n /= 2;
        else
            n = 3*n + 1;
        ++count;
    }
    return count;
}

int main() {
    int max = 0, maxi;
    for (int i = 999999; i > 0; --i) {
        int s = sequence(i);
        if (s > max) {
            max = s;
            maxi = i;
        }
    }
    std::cout << maxi << std::endl;
}
I know about the compiler optimizations to improve speed and everything, but I don’t see many ways to further optimize my assembly solution (speaking programmatically, not mathematically).
The C++ code uses modulus every term and division every other term, while the assembly code only uses a single division every other term.
But the assembly is taking on average 1 second longer than the C++ solution. Why is this? I am asking mainly out of curiosity.
Execution times
My system: 64-bit Linux on 1.4 GHz Intel Celeron 2955U (Haswell microarchitecture).
Tags: c++, performance, assembly, optimization, x86
asked Nov 1, 2016 at 6:12 by rosghub; edited Sep 26, 2020 at 2:35 by ib.
Have you examined the assembly code that GCC generates for your C++ program?
– ruakh
Nov 1, 2016 at 6:14
Compile with -S to get the assembly that the compiler generated. The compiler is smart enough to realize that the modulus does the division at the same time.
– user3386109
Nov 1, 2016 at 6:15
I think your options are 1. Your measuring technique is flawed, 2. The compiler writes better assembly than you, or 3. The compiler uses magic.
– Galik
Nov 1, 2016 at 6:16
Generate the asm with the C code + line numbers interweaved and compare notes.
– legends2k
Nov 1, 2016 at 6:23
@jefferson The compiler can use faster brute force. For example maybe with SSE instructions.
– user253751
Nov 1, 2016 at 6:25
11 Answers
If you think a 64-bit DIV instruction is a good way to divide by two, then no wonder the compiler's asm output beat your hand-written code, even with
-O0 (compile fast, no extra optimization, and store/reload to memory
after/before every C statement so a debugger can modify variables).
See Agner Fog's Optimizing Assembly guide to learn how to write efficient asm. He also has instruction tables and a microarch guide for specific details for specific CPUs. See also the x86 tag wiki for more perf links.
See also this more general question about beating the compiler with hand-written asm: Is inline assembly language slower than native C++ code?. TL:DR: yes if you do it wrong (like this question).
Usually you're fine letting the compiler do its thing, especially if you try to write C++ that can compile efficiently. Also see Is assembly faster than compiled languages?. One of the answers links to these neat slides
showing how various C compilers optimize some really simple functions with cool tricks. Matt Godbolt's CppCon2017 talk “What Has My Compiler Done for Me Lately? Unbolting the Compiler's Lid” is in a similar
vein.
even:
    mov rbx, 2
    xor rdx, rdx
    div rbx
On Intel Haswell, div r64 is 36 uops, with a latency of 32-96 cycles, and a throughput of one per 21-74 cycles. (Plus the 2 uops to set up RBX and zero RDX, but out-of-order execution can run those early). High-uop-count
instructions like DIV are microcoded, which can also cause front-end bottlenecks. In this case, latency is the most relevant factor because it's part of a loop-carried dependency chain.
shr rax, 1 does the same unsigned division: It's 1 uop, with 1c latency, and can run 2 per clock cycle.
For comparison, 32-bit division is faster, but still horrible vs. shifts. idiv r32 is 9 uops, 22-29c latency, and one per 8-11c throughput on Haswell.
As you can see from looking at gcc's -O0 asm output (Godbolt compiler explorer), it only uses shift instructions. clang -O0 does compile naively like you thought, even using 64-bit IDIV twice. (When optimizing, compilers do use both outputs of IDIV when the source does a division and modulus with the same operands, if they use IDIV at all.)
GCC doesn't have a totally-naive mode; it always transforms through GIMPLE, which means some "optimizations" can't be disabled. This includes recognizing division-by-constant and using shifts (power of 2) or a fixed-
point multiplicative inverse (non power of 2) to avoid IDIV (see div_by_13 in the above godbolt link).
gcc -Os (optimize for size) does use IDIV for non-power-of-2 division, unfortunately even in cases where the multiplicative inverse code is only slightly larger but much faster.
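As a concrete illustration of that fixed-point multiplicative-inverse trick (a sketch, not from the answer: the constant 0x4EC4EC4F satisfies the usual correctness condition for unsigned 32-bit division by 13 and is what GCC typically picks; check your own compiler's output on Godbolt):

    #include <cstdint>
    #include <cstdio>

    // floor(n / 13) for any uint32_t n, as a multiply + shift:
    // 0x4EC4EC4F = ceil(2^34 / 13); the error term is small enough that taking
    // the 64-bit product and shifting right by 34 is exact for all 32-bit n.
    static uint32_t div13(uint32_t n) {
        return (uint32_t)(((uint64_t)n * 0x4EC4EC4Fu) >> 34);
    }

    int main() {
        for (uint32_t n : {0u, 12u, 13u, 26u, 1000000u, 0xFFFFFFFFu})
            printf("%u / 13 = %u (plain division says %u)\n", n, div13(n), n / 13u);
    }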
First of all, it's only interesting to look at optimized compiler output. (-O3).
-O0 speed is basically meaningless.
Look at your asm output (on Godbolt, or see How to remove "noise" from GCC/clang assembly output?). When the compiler doesn't make optimal code in the first place: writing your C/C++ source in a way that guides the compiler into making better code is usually the best approach. You have to know asm, and know what's efficient, but you apply this knowledge indirectly. Compilers are also a good source of ideas: sometimes clang will do something cool, and you can hand-hold gcc into doing the same thing: see this answer and what I did with the non-unrolled loop in @Veedrac's code below.
This approach is portable, and in 20 years some future compiler can compile it to whatever is efficient on future hardware (x86 or not), maybe using new ISA extension or auto-vectorizing. Hand-written x86-64 asm from 15
years ago would usually not be optimally tuned for Skylake. e.g. compare&branch macro-fusion didn't exist back then. What's optimal now for hand-crafted asm for one microarchitecture might not be optimal for
other current and future CPUs. Comments on @johnfound's answer discuss major differences between AMD Bulldozer and Intel Haswell, which have a big effect on this code. But in theory, g++ -O3 -march=bdver3 and g++ -
O3 -march=skylake will do the right thing. (Or -march=native.) Or -mtune=... to just tune, without using instructions that other CPUs might not support.
My feeling is that guiding the compiler to asm that's good for a current CPU you care about shouldn't be a problem for future compilers. They're hopefully better than current compilers at finding ways to transform code, and
can find a way that works for future CPUs. Regardless, future x86 probably won't be terrible at anything that's good on current x86, and the future compiler will avoid any asm-specific pitfalls while implementing something
like the data movement from your C source, if it doesn't see something better.
Hand-written asm is a black-box for the optimizer, so constant-propagation doesn't work when inlining makes an input a compile-time constant. Other optimizations are also affected. Read
https://gcc.gnu.org/wiki/DontUseInlineAsm before using asm. (And avoid MSVC-style inline asm: inputs/outputs have to go through memory which adds overhead.)
In this case: your n has a signed type, and gcc uses the SAR/SHR/ADD sequence that gives the correct rounding. (IDIV and arithmetic-shift "round" differently for negative inputs, see the SAR insn set ref manual entry).
(IDK if gcc tried and failed to prove that n can't be negative, or what. Signed-overflow is undefined behaviour, so it should have been able to.)
You should have used uint64_t n, so it can just SHR. And so it's portable to systems where long is only 32-bit (e.g. x86-64 Windows).
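For example (a sketch of that advice, not additional code from the answer), the question's sequence() with uint64_t:

    #include <cstdint>
    #include <iostream>

    // Same brute-force loop as the question, but n is unsigned 64-bit:
    // n /= 2 can compile to a single SHR, and the code no longer depends on
    // the width of long (so it also works on x86-64 Windows).
    static int sequence(uint64_t n) {
        int count = 1;
        while (n != 1) {
            if (n % 2 == 0)
                n /= 2;
            else
                n = 3*n + 1;
            ++count;
        }
        return count;
    }

    int main() {
        int max = 0, maxi = 0;
        for (int i = 999999; i > 0; --i) {
            int s = sequence(i);
            if (s > max) { max = s; maxi = i; }
        }
        std::cout << maxi << std::endl;
    }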
BTW, gcc's optimized asm output looks pretty good (using unsigned long n ): the inner loop it inlines into main() does this:
.L9: # do{
lea rcx, [rax+1+rax*2] # rcx = 3*n + 1
mov rdi, rax
shr rdi # rdi = n>>1;
test al, 1 # set flags based on n%2 (aka n&1)
mov rax, rcx
cmove rax, rdi # n= (n%2) ? 3*n+1 : n/2;
add edx, 1 # ++count;
cmp rax, 1
jne .L9 #}while(n!=1)
The inner loop is branchless, and the critical path of the loop-carried dependency chain is:
the 3-component LEA (3 cycles)
cmov (2 cycles on Haswell, 1 cycle on Broadwell and later)
Total: 5 cycles per iteration, latency bottleneck. Out-of-order execution takes care of everything else in parallel with this (in theory: I haven't tested with perf counters to see if it really runs at 5c/iter).
The FLAGS input of cmov (produced by TEST) is faster to produce than the RAX input (from LEA->MOV), so it's not on the critical path.
Similarly, the MOV->SHR that produces CMOV's RDI input is off the critical path, because it's also faster than the LEA. MOV on IvyBridge and later has zero latency (handled at register-rename time). (It still takes a uop,
and a slot in the pipeline, so it's not free, just zero latency). The extra MOV in the LEA dep chain is part of the bottleneck on other CPUs.
The cmp/jne is also not part of the critical path: it's not loop-carried, because control dependencies are handled with branch prediction + speculative execution, unlike data dependencies on the critical path.
It could also save all the MOV instructions, and the TEST: SHR sets CF = the bit shifted out, so we can use cmovc instead of test / cmovz.
Even on Haswell, this version may help a bit by avoiding some occasional delays where a non-critical uop steals an execution port from one on the critical path, delaying execution by 1 cycle. (This is called a resource
conflict). It also saves a register, which may help when doing multiple n values in parallel in an interleaved loop (see below).
LEA's latency depends on the addressing mode, on Intel SnB-family CPUs before Ice Lake. 3c for 3 components ([base+idx+const], which takes two separate adds), but only 1c with 2 or fewer components (one add).
Some CPUs (like Core2) do even a 3-component LEA in a single cycle. Worse, SnB-family standardizes latencies: no 2c uops, otherwise 3-component LEA would be only 2c like Bulldozer. (3-component LEA is slower on
AMD too, just not by as much).
Ice Lake improved the LEA execution units to be 1c latency for all addressing modes, and 4/clock throughput except with a scaled index (then 2/clock). Alder Lake / Sapphire Rapids has 2c latency for shifted-index.
(https://uops.info/). Zen 3 and later run 3-component LEAs as 2 uops.
So lea rcx, [rax + rax*2] / inc rcx is only 2c latency, faster than lea rcx, [rax + rax*2 + 1] on Intel before Ice Lake. Break-even on BD and Alder Lake, and worse on Core2 and Ice Lake. It costs an extra uop which often isn't worth it to
save 1c latency, but latency is the major bottleneck here and HSW has a wide pipeline.
Neither GCC, ICC, nor Clang (on godbolt) used SHR's CF output, always using an AND or TEST. Silly compilers. :P They're great pieces of complex machinery, but a clever human can often beat them on small-scale
problems. (Given thousands to millions of times longer to think about it, of course! Compilers don't use exhaustive algorithms to search for every possible way to do things; that would take too long when optimizing a lot of
inlined code, which is what they do best. They also don't model the pipeline in the target uarch, not in the same detail as IACA or especially https://uica.uops.info/; they just use some heuristics.)
Simple loop unrolling won't help; this loop bottlenecks on the latency of a loop-carried dependency chain, not on loop overhead / throughput. This means it would do well with hyperthreading (or any other kind of SMT),
since the CPU has lots of time to interleave instructions from two threads. This would mean parallelizing the loop in main, but that's fine because each thread can just check a range of n values and produce a pair of integers
as a result.
Interleaving by hand within a single thread might be viable, too. Maybe compute the sequence for a pair of numbers in parallel, since each one only takes a couple registers, and they can all update the same max / maxi.
This creates more instruction-level parallelism.
The trick is deciding whether to wait until all the n values have reached 1 before getting another pair of starting n values, or whether to break out and get a new start point for just one that reached the end condition, without
touching the registers for the other sequence. Probably it's best to keep each chain working on useful data, otherwise you'd have to conditionally increment its counter.
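A sketch of that single-thread interleaving (illustrative only; it uses the simpler policy of waiting until both chains reach 1 before starting the next pair, which is the first of the two options above):

    #include <cstdint>
    #include <cstdio>

    static inline uint64_t step(uint64_t n) {
        return (n & 1) ? 3*n + 1 : n >> 1;
    }

    int main() {
        unsigned max = 0, maxi = 0;
        for (uint64_t i = 2; i < 1000000; i += 2) {
            uint64_t a = i, b = i + 1;          // two independent dependency chains
            unsigned ca = 1, cb = 1;            // counts, 1-based like the question
            while (a != 1 || b != 1) {          // run until BOTH chains are done
                ca += (a != 1);                 // a finished chain stops counting
                cb += (b != 1);
                a = (a != 1) ? step(a) : 1;     // ...and just parks at 1
                b = (b != 1) ? step(b) : 1;
            }
            if (ca > max) { max = ca; maxi = (unsigned)i; }
            if (cb > max) { max = cb; maxi = (unsigned)i + 1; }
        }
        printf("%u (count %u)\n", maxi, max);
    }

The two 3n+1 / shift chains are independent, so their roughly 5-cycle latency chains can overlap in the out-of-order core; the cost is that a short chain sits idle (parked at 1) until the longer one finishes.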
You could maybe even do this with SSE packed-compare stuff to conditionally increment the counter for vector elements where n hadn't reached 1 yet. And then to hide the even longer latency of a SIMD conditional-increment implementation, you'd need to keep more vectors of n values up in the air. Maybe only worth it with 256b vectors (4x uint64_t).
I think the best strategy to make detection of a 1 "sticky" is to mask the vector of all-ones that you add to increment the counter. So after you've seen a 1 in an element, the increment-vector will have a zero, and +=0 is a no-op.
.inner_loop:
    vpaddq    ymm1, ymm0, ymm0            # 2*n
    vpaddq    ymm1, ymm1, ymm0            # 3*n
    vpaddq    ymm1, ymm1, set1_epi64(1)   # ymm1 = 3*n + 1.  Maybe could do this more efficiently?

    # FP blend between integer insns may cost extra bypass latency, but integer blends don't have 1 bit controlling a whole qword.
    vblendvpd ymm0, ymm0, ymm1, ymm3      # variable blend controlled by the sign bit of each 64-bit element.  I might have the source operands backwards, I always have to look this up.

    vpaddq    ymm5, ymm5, ymm4            # count++ in elements where n has never been == 1
You can and should implement this with intrinsics, not hand-written asm.
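For example, a sketch with AVX2 intrinsics (illustrative, not the answer's exact code; it assumes an AVX2-capable CPU and -mavx2). The "sticky" part is the active mask: once a lane reaches 1 its counter stops, while the lane itself keeps harmlessly cycling 1 → 4 → 2 → 1:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Advance 4 uint64_t Collatz sequences at once and return their step counts.
    static void collatz4(const uint64_t start[4], uint64_t steps[4]) {
        const __m256i one = _mm256_set1_epi64x(1);
        __m256i n      = _mm256_loadu_si256((const __m256i *)start);
        __m256i count  = _mm256_setzero_si256();
        __m256i active = _mm256_andnot_si256(_mm256_cmpeq_epi64(n, one),
                                             _mm256_set1_epi64x(-1));   // lanes still running
        while (!_mm256_testz_si256(active, active)) {                   // any lane left?
            __m256i isodd = _mm256_cmpeq_epi64(_mm256_and_si256(n, one), one);
            __m256i half  = _mm256_srli_epi64(n, 1);                              // n/2
            __m256i trip  = _mm256_add_epi64(_mm256_add_epi64(n, n),
                                             _mm256_add_epi64(n, one));           // 3n+1
            n      = _mm256_blendv_epi8(half, trip, isodd);                 // per-lane select
            count  = _mm256_add_epi64(count, _mm256_and_si256(active, one)); // +1 only if active
            active = _mm256_andnot_si256(_mm256_cmpeq_epi64(n, one), active); // sticky: off at 1
        }
        _mm256_storeu_si256((__m256i *)steps, count);
    }

    int main() {
        uint64_t start[4] = {6, 7, 27, 837799}, steps[4];
        collatz4(start, steps);
        for (int i = 0; i < 4; ++i)
            printf("%llu: %llu steps\n", (unsigned long long)start[i],
                   (unsigned long long)steps[i]);
    }

You're still gated by the slowest lane in the vector; 837799 should report 524 steps (525 terms, matching the counts in the other answers).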
@EOF points out that tzcnt (or bsf) could be used to do multiple n/=2 iterations in one step. To vectorize that efficiently, we probably need AVX-512 vplzcntq after isolating the lowest set bit with v & -v. Or just do multiple scalar ns in parallel in different integer regs.
goto loop_entry;        // C++ structured like the asm, for illustration only
do {
    n = n*3 + 1;
  loop_entry:
    shift = _tzcnt_u64(n);
    n >>= shift;
    count += shift;
} while (n != 1);
This may do significantly fewer iterations, but variable-count shifts are slow on Intel SnB-family CPUs without BMI2. 3 uops, 2c latency for FLAGS, although only 1c for the actual data. (count=0 means the flags are
unmodified. They handle this as a data dependency, and take multiple uops because a uop can only have 2 inputs (pre-HSW/BDW anyway)). This is the kind of thing that people complaining about x86's crazy-CISC design
are referring to. It makes x86 CPUs slower than they would be if the ISA was designed from scratch today, even in a mostly-similar way. (i.e. this is part of the "x86 tax" of speed / power cost.) BMI2 SHRX/SHLX/SARX are
1 uop / 1c latency.
It also puts tzcnt (3c on Haswell and later) on the critical path, so it significantly lengthens the total latency of the loop-carried dependency chain. It does remove any need for a CMOV, or for preparing a register holding
n>>1 , though. @Veedrac's answer overcomes all this by deferring the tzcnt/shift for multiple iterations, which is highly effective (see below).
We can safely use BSF or TZCNT interchangeably, because n can never be zero at that point. TZCNT's machine-code decodes as BSF on CPUs that don't support BMI1. (Meaningless prefixes are ignored, so REP BSF
runs as BSF).
TZCNT performs much better than BSF on AMD CPUs that support it, so it can be a good idea to use REP BSF , even if you don't care about setting ZF if the input is zero rather than the output. Some compilers do this when
you use __builtin_ctzll even with -mno-bmi.
They perform the same on Intel CPUs, so just save the byte if that's all that matters. TZCNT on Intel (pre-Skylake) still has a false-dependency on the supposedly write-only output operand, just like BSF, to support the
undocumented behaviour that BSF with input = 0 leaves its destination unmodified. So you need to work around that unless optimizing only for Skylake, so there's nothing to gain from the extra REP byte. (Intel often goes
above and beyond what the x86 ISA manual requires, to avoid breaking widely-used code that depends on something it shouldn't, or that is retroactively disallowed. e.g. Windows 9x assumed no speculative prefetching of TLB entries, which was safe when the code was written, before Intel updated the TLB management rules.)
Anyway, LZCNT/TZCNT on Haswell have the same false dep as POPCNT: see this Q&A. This is why in gcc's asm output for @Veedrac's code, you see it breaking the dep chain with xor-zeroing on the register it's about to
use as TZCNT's destination when it doesn't use dst=src. Since TZCNT/LZCNT/POPCNT never leave their destination undefined or unmodified, this false dependency on the output on Intel CPUs is a performance bug /
limitation. Presumably it's worth some transistors / power to have them behave like other uops that go to the same execution unit. The only perf upside is interaction with another uarch limitation: they can micro-fuse a
memory operand with an indexed addressing mode on Haswell, but on Skylake where Intel removed the false dep for LZCNT/TZCNT they "un-laminate" indexed addressing modes while POPCNT can still micro-fuse any
addr mode.
I put tidied / improved C (which guides the compiler to produce better asm), and tested+working faster asm (in comments below the C) up on Godbolt: see the link in
@hidefromkgb's answer. (This answer hit the 30k char
limit from the large Godbolt URLs, but shortlinks can rot and were too long for goo.gl anyway.)
Also improved the output-printing to convert to a string and make one write() instead of writing one char at a time. This minimizes impact on timing the whole program with perf stat ./collatz (to record performance counters), and
I de-obfuscated some of the non-critical asm.
@Veedrac's code
I got a minor speedup from right-shifting as much as we know needs doing, and checking to continue the loop. From 7.5s for limit=1e8 down to 7.275s, on Core2Duo (Merom), with an unroll factor of 16.
code + comments on Godbolt. Don't use this version with clang; it does something silly with the defer-loop. Using a tmp counter k and then adding it to count later changes what clang does, but that slightly hurts gcc.
See discussion in comments: Veedrac's code is excellent on CPUs with BMI1 (i.e. not Celeron/Pentium)
answered Nov 1, 2016 at 7:04 by Peter Cordes; edited Nov 14, 2023 at 19:33
I've tried out the vectorized approach a while ago, it didn't help (because you can do much better in scalar code with tzcnt and you're locked to the longest-running sequence among your vector-elements in the
vectorized case).
– EOF
Nov 1, 2016 at 8:57
@EOF: no, I meant breaking out of the inner loop when any one of the vector elements hits 1, instead of when they all have (easily detectable with PCMPEQ/PMOVMSK). Then you use PINSRQ and stuff to fiddle
with the one element that terminated (and its counters), and jump back into the loop. That can easily turn into a loss, when you're breaking out of the inner loop too often, but it does mean you're always getting 2 or 4
elements of useful work done every iteration of the inner loop. Good point about memoization, though.
– Peter Cordes
Nov 1, 2016 at 9:27
@jefferson Best I managed is godbolt.org/g/1N70Ib. I was hoping I could do something smarter, but it seems not.
– Veedrac
Nov 2, 2016 at 21:47
The thing that amazes me about incredible answers such as this is the knowledge shown to such detail. I will never know a language or system to that level and I wouldn't know how. Well done sir.
– camden_kid
Nov 4, 2016 at 17:24
@csch: thanks. I'm glad so many people got something out of what I wrote. I'm pretty proud of it, and think it does a good job of explaining some optimization basics and specific details relevant for this problem.
– Peter Cordes
Nov 12, 2017 at 15:14
Claiming that the C++ compiler can produce more optimal code than a competent assembly language programmer is a very bad mistake. And especially in this case. The human always can make the code better than the
compiler can, and this particular situation is a good illustration of this claim.
The timing difference you're seeing is because the assembly code in the question is very far from optimal in the inner loops.
.seq:
        inc     esi                 ; counter
        lea     edx, [3*eax+1]      ; edx = 3*n+1
        shr     eax, 1              ; eax = n/2
        cmovc   eax, edx            ; if CF eax = edx
        jnz     .seq                ; jmp if n<>1
include "%lib%/freshlib.inc"
@BinaryType console, compact
options.DebugMode = 1
include "%lib%/freshlib.asm"
start:
InitializeAll
mov ecx, 999999
xor edi, edi ; max
xor ebx, ebx ; max i
.main_loop:
.seq:
inc esi ; counter
lea edx, [3*eax+1] ; edx = 3*n+1
shr eax, 1 ; eax = n/2
cmovc eax, edx ; if CF eax = edx
jnz .seq ; jmp if n<>1
dec ecx
jnz .main_loop
In my tests, (1 GHz AMD A4-1200 processor), the above code is approximately four times faster than the C++ code from the question (when compiled with -O0: 430 ms vs. 1900 ms), and more than two times faster (430 ms vs. 830 ms) when the C++ code is compiled with -O3.
The output of both programs is the same: max sequence = 525 on i = 837799.
answered Nov 1, 2016 at 8:29 by johnfound; edited Dec 29, 2020 at 12:59 by Adrian Mole
Huh, that's clever. SHR sets ZF only if EAX was 1 (or 0). I missed that when optimizing gcc's -O3 output, but I did spot all other optimizations you made to the inner loop. (But why do you use LEA for the counter increment instead of INC? It's ok to clobber flags at that point, and LEA leads to a slowdown on anything except maybe P4 (false dependency on old flags for both INC and SHR). LEA can't run on as many ports, and could lead to resource conflicts delaying the critical path more often.)
– Peter Cordes
Nov 1, 2016 at 8:44
Oh, actually Bulldozer might bottleneck on throughput with the compiler output. It has lower latency CMOV and 3-component LEA than Haswell (which I was considering), so the loop-carried dep chain is only 3 cycles
in your code. It also doesn't have zero-latency MOV instructions for integer registers, so g++'s wasted MOV instructions actually increase the latency of the critical path, and are a big deal for Bulldozer. So yeah, hand-
optimization really does beat the compiler in a significant way for CPUs that aren't ultra-modern enough to chew through the useless instructions.
– Peter Cordes
Nov 1, 2016 at 9:02
"Claiming that the C++ compiler can produce better code than a competent assembly programmer is a very bad mistake. And especially in this case. The human always can make the code better than the compiler, and this particular problem is a good illustration of this claim." You can reverse it and it would be just as valid: "Claiming a human is better is a very bad mistake. And especially in this case. The human always can make the code worse than the compiler, and this particular question is a good illustration of this claim." So I don't think you have a point here, such generalizations are wrong.
– luk32
Nov 1, 2016 at 15:16
@luk32 - But the author of the question can not be any argument at all, because his knowledge of assembly language is close to zero. Every argument about human vs. compiler implicitly assumes a human with at least some middle level of asm knowledge. More: the theorem "The human-written code will always be better or the same as the compiler-generated code" is very easy to be formally proved.
– johnfound
Nov 1, 2016 at 15:28
@luk32: A skilled human can (and usually should) start with compiler output. So as long as you benchmark your attempts to make sure they're actually faster (on the target hardware you're tuning for), you can't do
worse than the compiler. But yeah, I have to agree it's a bit of a strong statement. Compilers usually do much better than novice asm coders. But it's usually possible to save an instruction or two compared to what
compilers come up with. (Not always on the critical path, though, depending on uarch) . They're highly useful pieces of complex machinery, but they're not "smart".
– Peter Cordes
Nov 1, 2016 at 15:31
For more performance: A simple change is observing that after n = 3n+1, n will be even, so you can divide by 2 immediately. And n won't be 1, so you don't need to test for it. So you could save a few if statements and write:
while (n % 2 == 0) n /= 2;      // strip all factors of 2 up front
if (n > 1) for (;;) {
    n = (3*n + 1) / 2;
    if (n % 2 == 0) {
        do n /= 2; while (n % 2 == 0);
        if (n == 1) break;
    }
}
Here's a big win: If you look at the lowest 8 bits of n, all the steps until you have divided by 2 eight times are completely determined by those eight bits. For example, if the last eight bits are 0x01, that is, in binary your number is ???? 0000 0001, then the next steps are:

256k + 1 -> 768k + 4 -> 384k + 2 -> 192k + 1 -> 576k + 4 -> 288k + 2 -> 144k + 1
         -> 432k + 4 -> 216k + 2 -> 108k + 1 -> 324k + 4 -> 162k + 2 -> 81k + 1

So after eight divisions by 2, all these steps can be predicted, and 256k + 1 is replaced with 81k + 1. Something similar will happen for all combinations. So you can make a loop with a big switch statement:
k = n / 256;
m = n % 256;

switch (m) {
    case 0: n = 1 * k + 0; break;
    case 1: n = 81 * k + 1; break;
    case 2: n = 81 * k + 2; break;
    ...
    case 155: n = 729 * k + 445; break;
    ...
}
Run the loop until n ≤ 128, because at that point n could become 1 with fewer than eight divisions by 2, and doing eight or more steps at a time would make you miss the point where you reach 1 for the first time. Then
continue the "normal" loop - or have a table prepared that tells you how many more steps are need to reach 1.
PS. I strongly suspect Peter Cordes' suggestion would make it even faster. There will be no conditional branches at all except one, and that one will be predicted correctly except when the loop actually ends. So the code
would be something like
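A sketch of that (with the multipliers[] / adders[] tables that the comments below mention; rather than writing out all 256 cases by hand, this illustrative reconstruction computes them at startup by running the eight-halvings step symbolically on n = 256*k + m):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    static uint64_t multipliers[256], adders[256];

    static void init_tables(void) {
        for (unsigned m = 0; m < 256; ++m) {
            uint64_t mul = 256, add = m;                       /* n = mul*k + add */
            for (int halvings = 0; halvings < 8; ) {
                if (add & 1) { mul *= 3; add = 3*add + 1; }         /* n -> 3n+1 */
                else         { mul >>= 1; add >>= 1; ++halvings; }  /* n -> n/2  */
            }
            multipliers[m] = mul;   /* = 3^(number of odd steps in these 8 halvings) */
            adders[m]      = add;
        }
    }

    int main(void) {
        init_tables();
        printf("m = 1:   n -> %llu*k + %llu\n",
               (unsigned long long)multipliers[1], (unsigned long long)adders[1]);
        printf("m = 155: n -> %llu*k + %llu\n",
               (unsigned long long)multipliers[155], (unsigned long long)adders[155]);
        uint64_t n = 837799;
        while (n > 128) {                          /* as described below */
            size_t lastBits = n % 256;
            n = (n >> 8) * multipliers[lastBits] + adders[lastBits];
        }
        while (n != 1) n = (n & 1) ? 3*n + 1 : n >> 1;   /* finish with the plain rule */
        printf("837799 still reaches 1 through the table transform\n");
        return 0;
    }

(Testing the parity of add alone is valid because mul stays even until the eighth halving, so the parity of mul*k + add is the parity of add.)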
In practice, you would measure whether processing the last 9, 10, 11, 12 bits of n at a time would be faster. For each bit, the number of entries in the table would double, and I expect a slowdown when the tables don't fit
into L1 cache anymore.
PPS. If you need the number of operations: In each iteration we do exactly eight divisions by two, and a variable number of (3n + 1) operations, so an obvious method to count the operations would be another array. But we
can actually calculate the number of steps (based on number of iterations of the loop).
We could redefine the problem slightly: Replace n with (3n + 1) / 2 if odd, and replace n with n / 2 if even. Then every iteration will do exactly 8 steps, but you could consider that cheating :-) So assume there were r
operations n <- 3n+1 and s operations n <- n/2. The result will be quite exactly n' = n * 3^r / 2^s, because n <- 3n+1 means n <- 3n * (1 + 1/3n). Taking the logarithm we find r = (s + log2 (n' / n)) / log2 (3).
If we do the loop until n ≤ 1,000,000 and have a precomputed table how many iterations are needed from any start point n ≤ 1,000,000 then calculating r as above, rounded to the nearest integer, will give the right result
unless s is truly large.
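As a quick sanity check of that formula (not from the answer): starting at n = 27 the full sequence reaches n' = 1 after s = 70 halvings, and r = (70 + log2(1/27)) / log2(3) = (70 - 4.75) / 1.585 ≈ 41.2, which rounds to the true count of 41 3n+1 operations (111 steps in total).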
answered Nov 2, 2016 at 10:04 by gnasher729; edited Jan 18, 2017 at 11:30 by UmNyobe
Or make data lookup tables for the multiply and add constants, instead of a switch. Indexing two 256-entry tables is faster than a jump table, and compilers probably aren't looking for that transformation.
– Peter Cordes
Nov 2, 2016 at 11:03
Hmm, I thought for a minute this observation might prove the Collatz conjecture, but no, of course not. For every possible trailing 8 bits, there's a finite number of steps until they're all gone. But some of those trailing
8-bit patterns will lengthen the rest of the bitstring by more than 8, so this can't rule out unbounded growth or a repeating cycle.
– Peter Cordes
Nov 2, 2016 at 11:09
To update count, you need a third array, right? adders[] doesn't tell you how many right-shifts were done.
– Peter Cordes
Nov 2, 2016 at 22:06
For larger tables, it would be worth using narrower types to increase cache density. On most architectures, a zero-extending load from a uint16_t is very cheap. On x86, it's just as cheap as zero-extending from 32-bit
unsigned int to uint64_t. (MOVZX from memory on Intel CPUs only needs a load-port uop, but AMD CPUs do need the ALU as well.) Oh BTW, why are you using size_t for lastBits? It's a 32-bit type with -m32, and even -mx32
(long mode with 32-bit pointers). It's definitely the wrong type for n. Just use unsigned.
– Peter Cordes
Nov 2, 2016 at 22:21
When traversing the sequence, we can only get 3 possible cases in the 2-neighborhood of the current element N (shown first):
1. [even] [odd]
2. [odd] [even]
3. [even] [even]
To leap past these 2 elements means to compute (N >> 1) + N + 1, ((N << 1) + N + 1) >> 1 and N >> 2, respectively.
Let's prove that for both cases (1) and (2) it is possible to use the first formula, (N >> 1) + N + 1.
Case (1) is obvious. Case (2) implies (N & 1) == 1, so if we assume (without loss of generality) that N is 2-bit long and its bits are ba from most- to least-significant, then a = 1, and the following holds:
    (N << 1) + N + 1:        (N >> 1) + N + 1:
            b10                       b1
             b1                        b
           +  1                     +  1
           ----                     ----
           bBb0                      bBb

where B = !b. The first result is exactly twice the second, so ((N << 1) + N + 1) >> 1 == (N >> 1) + N + 1.
As proven, we can traverse the sequence 2 elements at a time, using a single ternary operation. Another 2× time reduction.
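A sketch of a sequence() helper matching the main() below (a reconstruction for illustration, not the author's exact code): it applies the two-elements-per-leap rule derived above (n >> 2 when n is divisible by 4, otherwise (n >> 1) + n + 1) as a single ternary, adds 2 to the count per leap, and stops at n <= 2 as explained after the code:

    #include <stdio.h>
    #include <stdint.h>

    /* Longest Collatz sequence below `limit`, walking 2 elements per iteration.
       Counts terms (like the question), so the expected output for 1000000 is
       837799, 525. */
    uint64_t sequence(uint64_t limit, uint64_t *maxc) {
        uint64_t maxi = 0;
        *maxc = 0;
        for (uint64_t i = 2; i < limit; i++) {
            uint64_t n = i, c = 1;
            while (n > 2) {
                n = (n & 3) ? (n >> 1) + n + 1 : n >> 2;   /* one ternary = 2 steps */
                c += 2;
            }
            if (n == 2) c += 1;                            /* odd-length tail: 2 -> 1 */
            if (c > *maxc) { *maxc = c; maxi = i; }
        }
        return maxi;
    }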
int main() {
    uint64_t maxi, maxc;
    maxi = sequence(1000000, &maxc);
    printf("%llu, %llu\n", maxi, maxc);
    return 0;
}
Here we compare n > 2 because the process may stop at 2 instead of 1 if the total length of the sequence is odd.
[EDIT:]
DEC RCX;
AND RCX, -2;
XOR RAX, RAX;
MOV RBX, RAX;
@main:
XOR RSI, RSI;
LEA RDI, [RCX + 1];
@loop:
ADD RSI, 2;
LEA RDX, [RDI + RDI*2 + 2];
SHR RDX, 1;
SHRD RDI, RDI, 2; ror rdi,2 would do the same thing
CMOVL RDI, RDX; Note that SHRD leaves OF = undefined with count>1, and this doesn't work on all CPUs.
CMOVS RDI, RDX;
CMP RDI, 2;
JA @loop;
SUB RCX, 2;
JA @main;
@itoa:
XOR RDX, RDX;
DIV RCX;
ADD RDX, '0';
PUSH RDX;
TEST RAX, RAX;
JNE @itoa;
PUSH RCX;
LEA RAX, [RBX + 1];
TEST RBX, RBX;
MOV RBX, RDI;
JNE @itoa;
POP RCX;
INC RDI;
MOV RDX, RDI;
@outp:
MOV RSI, RSP;
MOV RAX, RDI;
SYSCALL;
POP RAX;
TEST RAX, RAX;
JNE @outp;
See the C and an improved/bugfixed version of the asm by Peter Cordes on Godbolt. (editor's note: Sorry for putting my stuff in your answer, but my answer hit the 30k char limit from Godbolt links + text!)
answered Nov 1, 2016 at 19:35 by hidefromkgb; edited Nov 4, 2016 at 6:42
There is no integral Q such that 12 = 3Q + 1 . Your first point is not right, methinks.
– Veedrac
Nov 2, 2016 at 13:36
@Veedrac: Been playing around with this: It can be implemented with better asm than the implementation in this answer, using ROR / TEST and only one CMOV. This asm code infinite-loops on my CPU, since it
apparently relies on OF, which is undefined after SHRD or ROR with count > 1. It also goes to great lengths to try to avoid mov reg, imm32 , apparently to save bytes, but then it uses the 64-bit version of register
everywhere, even for xor rax, rax , so it has lots of unnecessary REX prefixes. We obviously only need REX on the regs holding n in the inner loop to avoid overflow.
– Peter Cordes
Nov 2, 2016 at 14:56
Timing results (from a Core2Duo E6600: Merom 2.4GHz. Complex-LEA=1c latency, CMOV=2c). The best single-step asm inner-loop implementation (from Johnfound): 111ms per run of this @main loop.
Compiler output from my de-obfuscated version of this C (with some tmp vars): clang3.8 -O3 -march=core2: 96ms. gcc5.2: 108ms. From my improved version of clang's asm inner loop: 92ms (should see a much bigger
improvement on SnB-family, where complex LEA is 3c not 1c). From my improved + working version of this asm loop (using ROR+TEST, not SHRD): 87ms. Measured with 5 reps before printing
– Peter Cordes
Nov 2, 2016 at 15:46
Here are the first 66 record-setters (A006877 on OEIS); I've marked the even ones in bold: 2, 3, 6, 7, 9, 18, 25, 27, 54, 73, 97, 129, 171, 231, 313, 327, 649, 703, 871, 1161, 2223, 2463, 2919, 3711, 6171, 10971,
13255, 17647, 23529, 26623, 34239, 35655, 52527, 77031, 106239, 142587, 156159, 216367, 230631, 410011, 511935, 626331, 837799, 1117065, 1501353, 1723519, 2298025, 3064033, 3542887, 3732423,
5649499, 6649279, 8400511, 11200681, 14934241, 15733191, 31466382, 36791535, 63728127, 127456254, 169941673, 226588897, 268549803, 537099606, 670617279, 1341234558
– ShreevatsaR
Nov 4, 2016 at 2:42
@hidefromkgb Great! And I appreciate your other point better too now: 4k+2 → 2k+1 → 6k+4 = (4k+2) + (2k+1) + 1, and 2k+1 → 6k+4 → 3k+2 = (2k+1) + (k) + 1. Nice observation!
– ShreevatsaR
Nov 4, 2016 at 6:54
C++ programs are translated to assembly programs during the generation of machine code from the source code. It would be virtually wrong to say assembly is slower than C++. Moreover, the binary code generated differs
from compiler to compiler. So a smart C++ compiler may produce binary code more optimal and efficient than a dumb assembler's code.
However I believe your profiling methodology has certain flaws. The following are general guidelines for profiling:
1. Make sure your system is in its normal/idle state. Stop all running processes (applications) that you started or that use CPU intensively (or poll over the network).
2. Your data set must be large enough.
3. Your test must run for something more than 5-10 seconds.
4. Do not rely on just one sample. Perform your test N times. Collect results and calculate the mean or median of the result (a minimal timing harness along these lines is sketched below).
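A minimal C++ timing harness along those lines (a sketch, not from the answer; the workload is just the question's brute-force search):

    #include <algorithm>
    #include <chrono>
    #include <cstdio>
    #include <vector>

    static unsigned longest_below(unsigned limit) {        // the workload under test
        unsigned best = 0, besti = 0;
        for (unsigned i = 1; i < limit; ++i) {
            unsigned long long n = i;
            unsigned c = 1;
            while (n != 1) { n = (n & 1) ? 3*n + 1 : n >> 1; ++c; }
            if (c > best) { best = c; besti = i; }
        }
        return besti;
    }

    static volatile unsigned sink;                         // keeps the result alive

    int main() {
        using clk = std::chrono::steady_clock;
        std::vector<double> ms;
        for (int rep = 0; rep < 5; ++rep) {                // several samples, not one
            auto t0 = clk::now();
            sink = longest_below(1000000);
            auto t1 = clk::now();
            ms.push_back(std::chrono::duration<double, std::milli>(t1 - t0).count());
        }
        std::sort(ms.begin(), ms.end());
        printf("median of %zu runs: %.1f ms (result %u)\n",
               ms.size(), ms[ms.size() / 2], sink);
    }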
answered Nov 1, 2016 at 6:26; edited Nov 1, 2016 at 13:45
Yes I haven't done any formal profiling but I have run them both a few times and am capable of telling 2 seconds from 3 seconds. Anyway thanks for answering. I already picked up a good deal of info here
– rosghub
Nov 1, 2016 at 6:50
It's probably not just a measurement error, the hand-written asm code is using a 64-bit DIV instruction instead of a right-shift. See my answer. But yes, measuring correctly is important, too.
– Peter Cordes
Nov 1, 2016 at 7:05
Bullet points are more appropriate formatting than a code block. Please stop putting your text into a code block, because it's not code and doesn't benefit from a monospaced font.
– Peter Cordes
Nov 1, 2016 at 9:33
I don't really see how this answers the question. This isn't a vague question about whether assembly code or C++ code might be faster---it is a very specific question about actual code, which he's helpfully provided in the question itself. Your answer doesn't even mention any of that code, or do any type of comparison. Sure, your tips on how to benchmark are basically correct, but not enough to make an actual answer.
– Cody Gray - on strike ♦
Nov 1, 2016 at 14:51
From comments:
But, this code never stops (because of integer overflow)!?! – Yves Daoust
If it does overflow for one of those unlucky initial seeds, the overflowed number will very likely converge toward 1 without another overflow.
Still this poses interesting question, is there some overflow-cyclic seed number?
Any simple final converging series starts with power of two value (obvious enough?).
2^64 will overflow to zero, which is an undefined infinite loop according to the algorithm (it ends only with 1), but the most optimal solution in the answer will finish due to shr rax producing ZF=1.
Can we produce 2^64? If the starting number is 0x5555555555555555, it's an odd number, so the next number is 3n+1, which is 0xFFFFFFFFFFFFFFFF + 1 = 0. Theoretically this is an undefined state of the algorithm, but the optimized answer of johnfound will recover by exiting on ZF=1. The cmp rax,1 of Peter Cordes will end in an infinite loop (QED variant 1, "cheapo" through undefined 0 number).
How about some more complex number, which will create cycle without0? Frankly, I'm not sure, my Math theory is too hazy to get any serious idea, how to deal with it in serious way. But intuitively I would say the series will
converge to 1 for every number : 0 < number, as the 3n+1 formula will slowly turn every non-2 prime factor of original number (or intermediate) into some power of 2, sooner or later. So we don't need to worry about infinite
loop for original series, only overflow can hamper us.
So I just put a few numbers into a sheet and took a look at 8-bit truncated numbers.
There are three values overflowing to 0: 227, 170 and 85 (85 going directly to 0, other two progressing toward 85).
Funnily enough I did a check, which is the first number to suffer from 8-bit truncation, and already 27 is affected! It does reach value 9232 in the proper non-truncated series (the first truncated value is 322 in the 12th step), and the maximum value reached for any of the 2-255 input numbers in the non-truncated way is 13120 (for 255 itself); the maximum number of steps to converge to 1 is about 128 (+-2, not sure if "1" is to be counted, etc...).
Interestingly enough (for me) the number 9232 is the maximum for many other source numbers; what's so special about it? :-O 9232 = 0x2410 ... hmmm.. no idea.
Unfortunately I can't get any deep grasp of this series, why does it converge and what are the implications of truncating it to k bits, but with the cmp number,1 terminating condition it's certainly possible to put the algorithm into
infinite loop with particular input value ending as 0 after truncation.
But the value 27 overflowing for the 8-bit case is sort of alarming: this looks like if you count the number of steps to reach value 1, you will get a wrong result for the majority of numbers from the total k-bit set of integers. For the 8-bit integers, 146 numbers out of 256 have their series affected by truncation (some of them may still hit the correct number of steps by accident maybe, I'm too lazy to check).
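The 8-bit experiment is easy to redo in code (a sketch, not the author's spreadsheet): run the rule in uint8_t so 3n+1 wraps mod 256, report which starting values fall into 0, and count how many of the 2-255 series end up with a different step count than the untruncated ones:

    #include <cstdint>
    #include <cstdio>

    static int steps64(uint64_t n) {                 // reference: full-width arithmetic
        int c = 0;
        while (n != 1) { n = (n & 1) ? 3*n + 1 : n >> 1; ++c; }
        return c;
    }

    int main() {
        int changed = 0;
        for (unsigned start = 2; start < 256; ++start) {
            uint8_t n = (uint8_t)start;
            int c = 0;
            while (n != 1 && n != 0 && c < 1000) {   // 0 means 3n+1 wrapped around
                n = (n & 1) ? (uint8_t)(3*n + 1) : (uint8_t)(n >> 1);
                ++c;
            }
            if (n == 0)
                printf("%u falls to 0 after %d truncated steps\n", start, c);
            else if (c != steps64(start))
                ++changed;
        }
        printf("%d further series have a different step count after truncation\n", changed);
    }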
answered Nov 1, 2016 at 17:18 by Ped7g
"the overflown number will very likely converge toward 1 without another overflow": the code never stops. (That's a conjecture as I cannot wait until the end of times to be sure...)
– user1196549
Nov 1, 2016 at 17:25
@YvesDaoust oh, but it does?... for example the 27 series with 8b truncation looks like this: 82 41 124 62 31 94 47 142 71 214 107 66 (truncated) 33 100 50 25 76 38 19 58 29 88 44 22 11 34 17 52 26 13 40 20 10 5
16 8 4 2 1 (rest of it works without truncation). I don't get you, sorry. It would never stop if the truncated value would be equal to some of previously reached in currently ongoing series, and I can't find any such value
vs k-bit truncation (but I either can't figure out the Math theory behind, why this holds up for 8/16/32/64 bits truncation, just intuitively I think it works).
– Ped7g
Nov 1, 2016 at 17:30
I should have checked the original problem description sooner: "Although it has not been proved yet (Collatz Problem), it is thought that all starting numbers finish at 1."... ok, no wonder I can't get grasp of it with my
limited hazy Math knowledge... :D And from my sheet experiments I can assure you it does converge for every 2-255 number, either without truncation (to 1), or with 8 bit truncation (to either expected 1 or to 0 for three
numbers).
– Ped7g
Nov 1, 2016 at 17:33
Hem, when I say that it never stops, I mean... that it does not stop. The given code runs forever if you prefer.
– user1196549
Nov 1, 2016 at 17:39
Upvoted for analysis of what happens on overflow. The CMP-based loop could use cmp rax,1 / jna (i.e. do{}while(n>1)) to also terminate on zero. I thought about making an instrumented version of the loop that records the
max n seen, to give an idea of how close we get to overflow.
– Peter Cordes
Nov 2, 2016 at 5:14
You did not post the code generated by the compiler, so there's some guesswork here, but even without having seen it, one can say that this:
test rax, 1
jpe even
... has a 50% chance of mispredicting the branch, and that will come expensive.
The compiler almost certainly does both computations (which costs negligibly more since the div/mod has quite long latency, so the multiply-add is "free") and follows up with a CMOV. Which, of course, has zero percent chance of being mispredicted.
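For illustration (a sketch, not from this answer), the step written so that both results are computed and one is selected, which is the shape that typically becomes lea/shr plus a cmov:

    #include <cstdint>

    uint64_t collatz_step(uint64_t n) {
        uint64_t odd  = 3*n + 1;        // both outcomes computed unconditionally
        uint64_t even = n >> 1;
        return (n & 1) ? odd : even;    // usually a conditional move, not a branch
    }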
answered Nov 1, 2016 at 19:50 by Damon
There is some pattern to the branching; e.g. an odd number is always followed by an even number. But sometimes 3n+1 leaves multiple trailing zero bits, and that's when this will mispredict. I started writing about
division in my answer, and didn't address this other big red flag in the OP's code. (Note also that using a parity condition is really weird, compared to just JZ or CMOVZ. It's also worse for the CPU, because Intel CPUs
can macro-fuse TEST/JZ, but not TEST/JPE. Agner Fog says AMD can fuse any TEST/CMP with any JCC, so in that case it's only worse for human readers)
– Peter Cordes
Nov 6, 2016 at 16:14
For the Collatz problem, you can get a significant boost in performance by caching the "tails". This is a time/memory trade-off. See: memoization (https://en.wikipedia.org/wiki/Memoization). You could also look into dynamic programming solutions for other time/memory trade-offs.
import sys

inner_loop = 0                         # counts raw Collatz steps actually computed

def sequence(n, le_cache):
    # NOTE: this body is a sketch of the caching idea, not the exact original code.
    # Return the Collatz sequence for n, reusing cached "tails" from earlier runs.
    global inner_loop
    l = []
    while n not in le_cache:           # walk until we hit an already-cached tail
        inner_loop += 1
        l.append(n)
        n = n // 2 if n % 2 == 0 else 3 * n + 1
    l += le_cache[n]                   # the cached tail already ends with 1
    for i, m in enumerate(l):          # memoize the tail starting at each new value
        if m in le_cache:
            break
        le_cache[m] = l[i:]
    return l

if __name__ == "__main__":
    N = int(sys.argv[1]) if len(sys.argv) > 1 else 100000
    le_cache = {1: [1]}
    best = max(range(1, N), key=lambda i: len(sequence(i, le_cache)))
    print("longest sequence starts at {}".format(best))
    print("inner_loop = {}".format(inner_loop))
answered Nov 5, 2016 at 18:49 by Emanuel Landeholm; edited Nov 5, 2016 at 19:00
gnasher's answer shows that you can do much more than just cache the tails: high bits don't affect what happens next, and add / mul only propagate carry to the left, so high bits don't affect what happens to the low
bits. i.e. you can use LUT lookups to go 8 (or any number) of bits at a time, with multiply and add constants to apply to the rest of the bits. memoizing the tails is certainly helpful in a lot of problems like this, and for this
problem when you haven't thought of the better approach yet, or haven't proved it correct.
– Peter Cordes
Nov 5, 2016 at 22:56
If I understand gnasher's idea above correctly, I think tail memoization is an orthogonal optimization. So you could conceivably do both. It would be interesting to investigate how much you could gain from adding
memoization to gnasher's algorithm.
– Emanuel Landeholm
Nov 6, 2016 at 9:30
We can maybe make memoization cheaper by only storing the dense part of the results. Set an upper limit on N, and above that, don't even check memory. Below that, use hash(N) -> N as the hash function, so key =
position in the array, and doesn't need to be stored. An entry of 0 means not present yet. We can further optimize by only storing odd N in the table, so the hash function is n>>1, discarding the 1. Write the step code to
always end with a n>>tzcnt(n) or something to make sure it's odd.
– Peter Cordes
Nov 6, 2016 at 15:55
That's based on my (untested) idea that very large N values in the middle of a sequence are less likely to be common to multiple sequences, so we don't miss out on too much from not memoizing them. Also that a
reasonably-sized N will be part of many long sequences, even ones that start with very large N. (This may be wishful thinking; if it's wrong then only caching a dense range of consecutive N may lose out vs. a hash
table that can store arbitrary keys.) Have you done any kind of hit-rate testing to see if nearby starting N tend to have any similarity in their sequence values?
– Peter Cordes
Nov 6, 2016 at 16:06
You can just store pre-computed results for all n < N, for some large N. So you don't need the overhead of a hash table. The data in that table will be used eventually for every starting value. If you just want to confirm
that the Collatz sequence always ends in (1, 4, 2, 1, 4, 2, ...): This can be proven to be equivalent to proving that for n > 1, the sequence will eventually be less than the original n. And for that, caching tails will not
help.
– gnasher729
Nov 6, 2016 at 17:06
As a generic answer, not specifically directed at this task: In many cases, you can significantly speed up any program by making improvements at a high level. Like calculating data once instead of multiple times, avoiding
unnecessary work completely, using caches in the best way, and so on. These things are much easier to do in a high level language.
Writing assembler code, it is possible to improve on what an optimising compiler does, but it is hard work. And once it's done, your code is much harder to modify, so it is much more difficult to add algorithmic improvements.
Sometimes the processor has functionality that you cannot use from a high level language, inline assembly is often useful in these cases and still lets you use a high level language.
In the Euler problems, most of the time you succeed by building something, finding why it is slow, building something better, finding why it is slow, and so on and so on. That is very, very hard using assembler. A better
algorithm at half the possible speed will usually beat a worse algorithm at full speed, and getting the full speed in assembler isn't trivial.
answered Nov 4, 2016 at 17:15 by gnasher729
Totally agree with this. gcc -O3 made code that was within 20% of optimal on Haswell, for that exact algorithm. (Getting those speedups was the main focus of my answer only because that's what the question asked,
and has an interesting answer, not because it's the right approach.) Much bigger speedups were obtained from transformations that the compiler would be extremely unlikely to look for, like deferring right shifts, or
doing 2 steps at a time. Far bigger speedups than that can be had from memoization / lookup-tables. Still exhaustive testing, but not pure brute force.
– Peter Cordes
Nov 5, 2016 at 22:46
Still, having a simple implementation that's obviously correct is extremely useful for testing other implementations. What I'd do is probably just look at the asm output to see if gcc did it branchlessly like I expected
(mostly out of curiosity), and then move on to algorithmic improvements.
– Peter Cordes
Nov 5, 2016 at 22:50
Even without looking at assembly, the most obvious reason is that /= 2 is probably optimized as >>=1 and many processors have a very quick shift operation. But even if a processor doesn't have a shift operation, the integer
division is faster than floating point division.
Edit: your mileage may vary on the "integer division is faster than floating-point division" statement above. The comments below reveal that modern processors have prioritized optimizing fp division over integer division. So if someone were looking for the most likely reason for the speedup which this thread's question asks about, then the compiler optimizing /=2 as >>=1 would be the best 1st place to look.
On an unrelated note, if n is odd, the expression n*3+1 will always be even. So there is no need to check. You can change that branch to
{
    n = (n*3 + 1) >> 1;
    count += 2;
}

So the whole check becomes:

if (n & 1)
{
    n = (n*3 + 1) >> 1;
    count += 2;
}
else
{
    n >>= 1;
    ++count;
}
answered Nov 1, 2016 at 21:16; edited Nov 29, 2016 at 17:35
Integer division is not actually faster than FP division on modern x86 CPUs. I think this is due to Intel/AMD spending more transistors on their FP dividers, because it's a more important operation. (Integer division by
constants can be optimized to a multiply by a modular inverse). Check Agner Fog's insn tables, and compare DIVSD (double-precision float) with DIV r32 (32-bit unsigned integer) or DIV r64 (much slower 64-bit unsigned
integer). Especially for throughput, FP division is much faster (single uop instead of micro-coded, and partially pipelined), but latency is better too.
– Peter Cordes
Nov 2, 2016 at 5:21
e.g. on the OP's Haswell CPU: DIVSD is 1 uop, 10-20 cycles latency, one per 8-14c throughput. div r64 is 36 uops, 32-96c latency, and one per 21-74c throughput. Skylake has even faster FP division throughput
(pipelined at one per 4c with not much better latency), but not much faster integer div. Things are similar on AMD Bulldozer-family: DIVSD is 1M-op, 9-27c latency, one per 4.5-11c throughput. div r64 is 16M-ops, 16-
75c latency, one per 16-75c throughput.
– Peter Cordes
Nov 2, 2016 at 5:23
Isn't FP division basically the same as integer-subtract exponents, integer-divide mantissas, detect denormals? And those 3 steps can be done in parallel.
– MSalters
Nov 2, 2016 at 10:42
@MSalters: yeah, that sounds right, but with a normalization step at the end to shift bits between exponent and mantissa. double has a 53-bit mantissa, but it's still significantly slower than div r32 on Haswell. So it's
definitely just a matter of how much hardware Intel/AMD throw at the problem, because they don't use the same transistors for both integer and fp dividers. The integer one is scalar (there's no integer-SIMD divide),
and the vector one handles 128b vectors (not 256b like other vector ALUs). The big thing is that integer div is many uops, big impact on surrounding code.
– Peter Cordes
Nov 2, 2016 at 16:17
Err, not shift bits between mantissa and exponent, but normalize the mantissa with a shift, and add the shift amount to the exponent.
– Peter Cordes
Nov 7, 2016 at 15:23
doing a MOV RBX, 3 and MUL RBX is expensive; just ADD RBX, RBX twice
64-bit code is usually noticeably slower than 32-bit code and the alignment issues are more complicated; with small programs like this you have to pack them so you are doing parallel computation to have any chance
of being faster than 32-bit code
If you generate the assembly listing for your C++ program, you can see how it differs from your assembly.
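For example, with GCC or Clang a listing can be produced with -S (MSVC has /FA); a minimal file to try it on:

// collatz_step.cpp -- a one-step function whose generated assembly is easy to inspect:
//   g++ -O3 -S -masm=intel collatz_step.cpp     (emits collatz_step.s)
#include <cstdint>

uint64_t collatz_step(uint64_t n)
{
    return (n & 1) ? n*3 + 1 : n / 2;
}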
edited Nov 7, 2016 at 17:36
answered Nov 7, 2016 at 1:03
4
1): adding 3 times would be dumb compared to LEA. Also mul rbx on the OP's Haswell CPU is 2 uops with 3c latency (and 1 per clock throughput). imul rcx, rbx, 3 is only 1 uop, with the same 3c latency. Two ADD
instructions would be 2 uops with 2c latency.
– Peter Cordes
Nov 7, 2016 at 15:27
6
2) "ADD 1 is probably faster than INC here": nope, the OP is not using a Pentium 4. Your point 3) is the only correct part of this answer.
– Peter Cordes
Nov 7, 2016 at 15:28
5
4) sounds like total nonsense. 64-bit code can be slower with pointer-heavy data structures, because larger pointers means bigger cache footprint. But this code is working only in registers, and code alignment issues
are the same in 32 and 64 bit mode. (So are data alignment issues, no clue what you're talking about with alignment being a bigger issue for x86-64). Anyway, the code doesn't even touch memory inside the loop.
– Peter Cordes
Nov 7, 2016 at 15:31
The commenter has no idea what he is talking about. Doing a MOV+MUL on a 64-bit CPU will be roughly three times slower than adding a register to itself twice. His other remarks are equally incorrect.
– Tyler Durden
Nov 7, 2016 at 17:35
7
Well, MOV+MUL is definitely dumb, but MOV+ADD+ADD is still silly (actually, doing ADD RBX, RBX twice would multiply by 4, not 3). By far the best way is lea rax, [rbx + rbx*2]. Or, at the cost of making it a 3-component LEA, do the +1 as well with lea rax, [rbx + rbx*2 + 1] (3c latency on HSW instead of 1, as I explained in my answer). My point was that 64-bit multiply is not very expensive on recent Intel CPUs, because they have insanely fast integer multiply units (even compared to AMD, where the same MUL r64 is 6c latency, with one per 4c throughput: not even fully pipelined).
– Peter Cordes
Nov 7, 2016 at 21:06
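For reference, compilers already make this choice for the plain C++ expression; a sketch (assuming GCC or Clang at -O2; the exact instruction selection varies by compiler and tuning):

#include <cstdint>

// Typically compiles to an LEA-based sequence rather than an imul, e.g.
// `lea rax, [rdi+rdi*2]` followed by `add rax, 1`, or a single
// three-component `lea rax, [rdi+1+rdi*2]`, depending on compiler and tuning.
uint64_t three_n_plus_one(uint64_t n)
{
    return n*3 + 1;
}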