Skip to content

Commit 5be1dab

Browse files
committed
Optimize pg_memory_is_all_zeros() in memutils.h
pg_memory_is_all_zeros() is currently implemented to do only a byte-per-byte comparison. While being sufficient for its existing callers for pgstats entries, it could lead to performance regressions should it be used for larger memory areas, like 8kB blocks, or even future commits. This commit optimizes the implementation of this function to be more efficient for larger sizes, written in a way so that compilers can optimize the code. This is portable across 32b and 64b architectures. The implementation handles three cases, depending on the size of the input provided: * If less than sizeof(size_t), do a simple byte-by-byte comparison. * If between sizeof(size_t) and (sizeof(size_t) * 8 - 1): ** Phase 1: byte-by-byte comparison, until the pointer is aligned. ** Phase 2: size_t comparisons, with aligned pointers, up to the last aligned location possible. ** Phase 3: byte-by-byte comparison, until the end location. * If more than (sizeof(size_t) * 8) bytes, this is the same as case 2 except that an additional phase is placed between Phase 1 and Phase 2, with 8 * sizeof(size_t) comparisons using bitwise OR, to encourage compilers to use SIMD instructions if available. The last improvement proves to be at least 3 times faster than the size_t comparisons, which is something currently used for the all-zero page check in PageIsVerifiedExtended(). The optimization tricks that would encourage the use of SIMD instructions have been suggested by David Rowley. Author: Bertrand Drouvot Reviewed-by: Michael Paquier, Ranier Vilela Discussion: https://postgr.es/m/CAApHDvq7P-JgFhgtxUPqhavG-qSDVUhyWaEX9M8_MNorFEijZA@mail.gmail.com
1 parent 7b88529 commit 5be1dab

File tree

1 file changed

+116
-3
lines changed

1 file changed

+116
-3
lines changed

src/include/utils/memutils.h

+116-3
Original file line numberDiff line numberDiff line change
@@ -190,19 +190,132 @@ extern MemoryContext BumpContextCreate(MemoryContext parent,
190190
#define SLAB_LARGE_BLOCK_SIZE (8 * 1024 * 1024)
191191

192192
/*
193+
* pg_memory_is_all_zeros
194+
*
193195
* Test if a memory region starting at "ptr" and of size "len" is full of
194196
* zeroes.
197+
*
198+
* The test is divided into multiple cases for safety reason and multiple
199+
* phases for efficiency.
200+
*
201+
* Case 1: len < sizeof(size_t) bytes, then byte-by-byte comparison.
202+
* Case 2: len < (sizeof(size_t) * 8 - 1) bytes:
203+
* - Phase 1: byte-by-byte comparison, until the pointer is aligned.
204+
* - Phase 2: size_t comparisons, with aligned pointers, up to the last
205+
* location possible.
206+
* - Phase 3: byte-by-byte comparison, until the end location.
207+
* Case 3: len >= (sizeof(size_t) * 8) bytes, same as case 2 except that an
208+
* additional phase is placed between Phase 1 and Phase 2, with
209+
* (8 * sizeof(size_t)) comparisons using bitwise OR to encourage
210+
* compilers to use SIMD instructions if available, up to the last
211+
* aligned location possible.
212+
*
213+
* Case 1 and Case 2 are mandatory to ensure that we won't read beyond the
214+
* memory area. This is portable for 32-bit and 64-bit architectures.
215+
*
216+
* Caller must ensure that "ptr" is not NULL.
195217
*/
196218
static inline bool
197219
pg_memory_is_all_zeros(const void *ptr, size_t len)
198220
{
199-
const char *p = (const char *) ptr;
221+
const unsigned char *p = (const unsigned char *) ptr;
222+
const unsigned char *end = &p[len];
223+
const unsigned char *aligned_end = (const unsigned char *)
224+
((uintptr_t) end & (~(sizeof(size_t) - 1)));
225+
226+
if (len < sizeof(size_t))
227+
{
228+
while (p < end)
229+
{
230+
if (*p++ != 0)
231+
return false;
232+
}
233+
return true;
234+
}
235+
236+
/* "len" in the [sizeof(size_t), sizeof(size_t) * 8 - 1] range */
237+
if (len < sizeof(size_t) * 8)
238+
{
239+
/* Compare bytes until the pointer "p" is aligned */
240+
while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
241+
{
242+
if (p == end)
243+
return true;
244+
if (*p++ != 0)
245+
return false;
246+
}
247+
248+
/*
249+
* Compare remaining size_t-aligned chunks.
250+
*
251+
* There is no risk to read beyond the memory area, as "aligned_end"
252+
* cannot be higher than "end".
253+
*/
254+
for (; p < aligned_end; p += sizeof(size_t))
255+
{
256+
if (*(size_t *) p != 0)
257+
return false;
258+
}
259+
260+
/* Compare remaining bytes until the end */
261+
while (p < end)
262+
{
263+
if (*p++ != 0)
264+
return false;
265+
}
266+
return true;
267+
}
268+
269+
/* "len" in the [sizeof(size_t) * 8, inf) range */
200270

201-
for (size_t i = 0; i < len; i++)
271+
/* Compare bytes until the pointer "p" is aligned */
272+
while (((uintptr_t) p & (sizeof(size_t) - 1)) != 0)
202273
{
203-
if (p[i] != 0)
274+
if (p == end)
275+
return true;
276+
277+
if (*p++ != 0)
278+
return false;
279+
}
280+
281+
/*
282+
* Compare 8 * sizeof(size_t) chunks at once.
283+
*
284+
* For performance reasons, we manually unroll this loop and purposefully
285+
* use bitwise-ORs to combine each comparison. This prevents boolean
286+
* short-circuiting and lets the compiler know that it's safe to access
287+
* all 8 elements regardless of the result of the other comparisons. This
288+
* seems to be enough to coax a few compilers into using SIMD
289+
* instructions.
290+
*/
291+
for (; p < aligned_end - (sizeof(size_t) * 7); p += sizeof(size_t) * 8)
292+
{
293+
if ((((size_t *) p)[0] != 0) | (((size_t *) p)[1] != 0) |
294+
(((size_t *) p)[2] != 0) | (((size_t *) p)[3] != 0) |
295+
(((size_t *) p)[4] != 0) | (((size_t *) p)[5] != 0) |
296+
(((size_t *) p)[6] != 0) | (((size_t *) p)[7] != 0))
297+
return false;
298+
}
299+
300+
/*
301+
* Compare remaining size_t-aligned chunks.
302+
*
303+
* There is no risk to read beyond the memory area, as "aligned_end"
304+
* cannot be higher than "end".
305+
*/
306+
for (; p < aligned_end; p += sizeof(size_t))
307+
{
308+
if (*(size_t *) p != 0)
204309
return false;
205310
}
311+
312+
/* Compare remaining bytes until the end */
313+
while (p < end)
314+
{
315+
if (*p++ != 0)
316+
return false;
317+
}
318+
206319
return true;
207320
}
208321

0 commit comments

Comments
 (0)