|
6 | 6 | * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group |
7 | 7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | 8 | * |
9 | | - * |
10 | 9 | * IDENTIFICATION |
11 | 10 | * src/backend/storage/page/checksum.c |
12 | 11 | * |
13 | 12 | *------------------------------------------------------------------------- |
14 | | - * |
15 | | - * Checksum algorithm |
16 | | - * |
17 | | - * The algorithm used to checksum pages is chosen for very fast calculation. |
18 | | - * Workloads where the database working set fits into OS file cache but not |
19 | | - * into shared buffers can read in pages at a very fast pace and the checksum |
20 | | - * algorithm itself can become the largest bottleneck. |
21 | | - * |
22 | | - * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand |
23 | | - * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 |
24 | | - * byte at a time according to the formula: |
25 | | - * |
26 | | - * hash = (hash ^ value) * FNV_PRIME |
27 | | - * |
28 | | - * The FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ |
29 | | - * |
30 | | - * PostgreSQL doesn't use the FNV-1a hash directly because it mixes high bits |
31 | | - * poorly: high order bits in input data only affect high order bits in output |
32 | | - * data. To resolve this we xor in the value prior to multiplication, shifted |
33 | | - * right by 17 bits. The number 17 was chosen because it has no common |
34 | | - * denominator with the set bit positions in FNV_PRIME and empirically provides |
35 | | - * the fastest mixing, so high order bits of the final iterations quickly |
36 | | - * avalanche into lower positions. For performance reasons we choose to combine |
37 | | - * 4 bytes at a time. The actual hash formula used as the basis is: |
38 | | - * |
39 | | - * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) |
40 | | - * |
41 | | - * The main bottleneck in this calculation is the multiplication latency. To |
42 | | - * hide the latency and to make use of SIMD parallelism multiple hash values |
43 | | - * are calculated in parallel. The page is treated as a 32 column two |
44 | | - * dimensional array of 32 bit values. Each column is aggregated separately |
45 | | - * into a partial checksum. Each partial checksum uses a different initial |
46 | | - * value (offset basis in FNV terminology). The initial values actually used |
47 | | - * were chosen randomly, as the values themselves don't matter as much as that |
48 | | - * they are different and don't match anything in real data. After initializing |
49 | | - * partial checksums each value in the column is aggregated according to the |
50 | | - * above formula. Finally two more iterations of the formula are performed with |
51 | | - * value 0 to mix the bits of the last value added. |
52 | | - * |
53 | | - * The partial checksums are then folded together using xor to form a single |
54 | | - * 32-bit checksum. The caller can safely reduce the value to 16 bits |
55 | | - * using modulo 2^16-1. That will cause a very slight bias towards lower |
56 | | - * values but this is not significant for the performance of the |
57 | | - * checksum. |
58 | | - * |
59 | | - * The algorithm choice was based on what instructions are available in SIMD |
60 | | - * instruction sets. This meant that a fast and good algorithm needed to use |
61 | | - * multiplication as the main mixing operator. The simplest multiplication |
62 | | - * based checksum primitive is the one used by FNV. The prime used is chosen |
63 | | - * for good dispersion of values. It has no known simple patterns that result |
64 | | - * in collisions. A test of 5-bit differentials of the primitive over 64bit keys |
65 | | - * reveals no differentials with 3 or more values out of 100000 random keys |
66 | | - * colliding. An avalanche test shows that only high order bits of the last word |
67 | | - * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, |
68 | | - * overwriting page from random position to end with 0 bytes, and overwriting |
69 | | - * random segments of page with 0x00, 0xFF and random data all show optimal |
70 | | - * 2^-16 false positive rate within margin of error. |
71 | | - * |
72 | | - * Vectorization of the algorithm requires a 32bit x 32bit -> 32bit integer |
73 | | - * multiplication instruction. As of 2013 the corresponding instruction is |
74 | | - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). |
75 | | - * Vectorization requires a compiler to do the vectorization for us. For recent |
76 | | - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough |
77 | | - * to achieve vectorization. |
78 | | - * |
79 | | - * The optimal amount of parallelism to use depends on CPU specific instruction |
80 | | - * latency, SIMD instruction width, throughput and the amount of registers |
81 | | - * available to hold intermediate state. Generally, more parallelism is better |
82 | | - * up to the point that state doesn't fit in registers and extra load-store |
83 | | - * instructions are needed to swap values in/out. The number chosen is a fixed |
84 | | - * part of the algorithm because changing the parallelism changes the checksum |
85 | | - * result. |
86 | | - * |
87 | | - * The parallelism number 32 was chosen based on the fact that it is the |
88 | | - * largest state that fits into architecturally visible x86 SSE registers while |
89 | | - * leaving some free registers for intermediate values. For future processors |
90 | | - * with 256bit vector registers this will leave some performance on the table. |
91 | | - * When vectorization is not available it might be beneficial to restructure |
92 | | - * the computation to calculate a subset of the columns at a time and perform |
93 | | - * multiple passes to avoid register spilling. This optimization opportunity |
94 | | - * is not used. Current coding also assumes that the compiler has the ability |
95 | | - * to unroll the inner loop to avoid loop overhead and minimize register |
96 | | - * spilling. For less sophisticated compilers it might be beneficial to manually |
97 | | - * unroll the inner loop. |
98 | 13 | */ |
99 | 14 | #include "postgres.h" |
100 | 15 |
|
101 | 16 | #include "storage/checksum.h" |
102 | 17 |
|
103 | | -/* number of checksums to calculate in parallel */ |
104 | | -#define N_SUMS 32 |
105 | | -/* prime multiplier of FNV-1a hash */ |
106 | | -#define FNV_PRIME 16777619 |
107 | | - |
108 | | -/* |
109 | | - * Base offsets to initialize each of the parallel FNV hashes into a |
110 | | - * different initial state. |
111 | | - */ |
112 | | -static const uint32 checksumBaseOffsets[N_SUMS] = { |
113 | | - 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, |
114 | | - 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, |
115 | | - 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, |
116 | | - 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, |
117 | | - 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, |
118 | | - 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, |
119 | | - 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, |
120 | | - 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 |
121 | | -}; |
122 | | - |
123 | 18 | /* |
124 | | - * Calculate one round of the checksum. |
| 19 | + * The actual code is in storage/checksum_impl.h. This is done so that |
| 20 | + * external programs can incorporate the checksum code by #include'ing |
| 21 | + * that file from the exported Postgres headers. (Compare our CRC code.) |
125 | 22 | */ |
126 | | -#define CHECKSUM_COMP(checksum, value) do {\ |
127 | | - uint32 __tmp = (checksum) ^ (value);\ |
128 | | - (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\ |
129 | | -} while (0) |
130 | | - |
131 | | -uint32 |
132 | | -checksum_block(char *data, uint32 size) |
133 | | -{ |
134 | | - uint32 sums[N_SUMS]; |
135 | | - uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; |
136 | | - uint32 result = 0; |
137 | | - int i, |
138 | | - j; |
139 | | - |
140 | | - /* ensure that the size is compatible with the algorithm */ |
141 | | - Assert((size % (sizeof(uint32) * N_SUMS)) == 0); |
142 | | - |
143 | | - /* initialize partial checksums to their corresponding offsets */ |
144 | | - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); |
145 | | - |
146 | | - /* main checksum calculation */ |
147 | | - for (i = 0; i < size / sizeof(uint32) / N_SUMS; i++) |
148 | | - for (j = 0; j < N_SUMS; j++) |
149 | | - CHECKSUM_COMP(sums[j], dataArr[i][j]); |
150 | | - |
151 | | - /* finally add in two rounds of zeroes for additional mixing */ |
152 | | - for (i = 0; i < 2; i++) |
153 | | - for (j = 0; j < N_SUMS; j++) |
154 | | - CHECKSUM_COMP(sums[j], 0); |
155 | | - |
156 | | - /* xor fold partial checksums together */ |
157 | | - for (i = 0; i < N_SUMS; i++) |
158 | | - result ^= sums[i]; |
159 | | - |
160 | | - return result; |
161 | | -} |
| 23 | +#include "storage/checksum_impl.h" |
0 commit comments