Here is the code from the GCC testsuite: https://godbolt.org/z/jzdcsfxx4

```c
typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

v2di G1 (v2di r) {
  return (r >> 32) | (r << 32);
}

v4si G2 (v4si r) {
  return (r >> 16) | (r << 16);
}

v8hi G3 (v8hi r) {
  return (r >> 8) | (r << 8);
}

v2si G4 (v2si r) {
  return (r >> 16) | (r << 16);
}

v4hi G5 (v4hi r) {
  return (r >> 8) | (r << 8);
}
```

GCC efficiently uses rev32 or rev64 to complete the operation in a single instruction.