Skip to content

Commit d303122

Browse files
committed
Clean up the Simple-8b encoder code.
Coverity complained that simple8b_encode() might read beyond the end of the 'diffs' array, in the loop to encode the integers. That was a false positive, because we never get into the loop in modes 0 or 1, and the array is large enough for all the other modes. But I admit it's very subtle, so it's not surprising that Coverity didn't see it, and it's not very obvious to humans either. Refactor it, so that the second loop re-computes the differences, instead of carrying them over from the first loop in the 'diffs' array. This way, the 'diffs' array is not needed anymore. It makes no measurable difference in performance, and seems more straightforward this way. Also, improve the comments in simple8b_encode(): fix the comment about its return value that was flat-out wrong, and explain the condition when it returns EMPTY_CODEWORD better. In the passing, move the 'selector' from the codeword's low bits to the high bits. It doesn't matter much, but looking at the original paper, and googling around for other Simple-8b implementations, that's how it's usually done. Per Coverity, and Tom Lane's report off-list.
1 parent 148cf5f commit d303122

File tree

2 files changed

+41
-28
lines changed

2 files changed

+41
-28
lines changed

src/backend/lib/integerset.c

+30-22
Original file line numberDiff line numberDiff line change
@@ -773,7 +773,7 @@ intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
773773
*
774774
* Simple-8b algorithm packs between 1 and 240 integers into 64-bit words,
775775
* called "codewords". The number of integers packed into a single codeword
776-
* depends on the integers being packed: small integers are encoded using
776+
* depends on the integers being packed; small integers are encoded using
777777
* fewer bits than large integers. A single codeword can store a single
778778
* 60-bit integer, or two 30-bit integers, for example.
779779
*
@@ -783,11 +783,11 @@ intset_binsrch_leaf(uint64 item, leaf_item *arr, int arr_elems, bool nextkey)
783783
* of the absolute values.
784784
*
785785
* In Simple-8b, each codeword consists of a 4-bit selector, which indicates
786-
* how many integers are encoded in the codeword, and the encoded integers
786+
* how many integers are encoded in the codeword, and the encoded integers are
787787
* packed into the remaining 60 bits. The selector allows for 16 different
788-
* ways of using the remaining 60 bits, "modes". The number of integers
789-
* packed into a single codeword is listed in the simple8b_modes table below.
790-
* For example, consider the following codeword:
788+
* ways of using the remaining 60 bits, called "modes". The number of integers
789+
* packed into a single codeword in each mode is listed in the simple8b_modes
790+
* table below. For example, consider the following codeword:
791791
*
792792
* 20-bit integer 20-bit integer 20-bit integer
793793
* 1101 00000000000000010010 01111010000100100000 00000000000000010100
@@ -835,22 +835,28 @@ static const struct
835835
{20, 3}, /* mode 13: three 20-bit integers */
836836
{30, 2}, /* mode 14: two 30-bit integers */
837837
{60, 1}, /* mode 15: one 60-bit integer */
838+
838839
{0, 0} /* sentinel value */
839840
};
840841

841842
/*
842843
* EMPTY_CODEWORD is a special value, used to indicate "no values".
843844
* It is used if the next value is too large to be encoded with Simple-8b.
844845
*
845-
* This value looks like a 0-mode codeword, but we check for it
846+
* This value looks like a 0-mode codeword, but we check for it
846847
* specifically. (In a real 0-mode codeword, all the unused bits are zero.)
847848
*/
848-
#define EMPTY_CODEWORD UINT64CONST(0xFFFFFFFFFFFFFFF0)
849+
#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF)
849850

850851
/*
851852
* Encode a number of integers into a Simple-8b codeword.
852853
*
853-
* Returns the number of integers that were encoded.
854+
* The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD
855+
* elements.
856+
*
857+
* Returns the encoded codeword, and sets *num_encoded to the number
858+
* input integers that were encoded. It can be zero, if the first input is
859+
* too large to be encoded.
854860
*/
855861
static uint64
856862
simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
@@ -861,7 +867,6 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
861867
uint64 diff;
862868
uint64 last_val;
863869
uint64 codeword;
864-
uint64 diffs[60];
865870
int i;
866871

867872
Assert(ints[0] > base);
@@ -891,13 +896,12 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
891896
selector++;
892897
nints = simple8b_modes[selector].num_ints;
893898
bits = simple8b_modes[selector].bits_per_int;
899+
894900
if (i >= nints)
895901
break;
896902
}
897903
else
898904
{
899-
if (i < 60)
900-
diffs[i] = diff;
901905
i++;
902906
if (i >= nints)
903907
break;
@@ -910,7 +914,13 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
910914

911915
if (nints == 0)
912916
{
913-
/* The next value is too large and be encoded with Simple-8b */
917+
/*
918+
* The first value is too large to be encoded with Simple-8b.
919+
*
920+
* If there is at least one not-too-large integer in the input, we
921+
* will encode it using mode 15 (or a more compact mode). Hence, we
922+
* only get here, if the *first* input integer is >= 2^60.
923+
*/
914924
Assert(i == 0);
915925
*num_encoded = 0;
916926
return EMPTY_CODEWORD;
@@ -924,16 +934,18 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
924934
codeword = 0;
925935
if (bits > 0)
926936
{
927-
for (i = nints - 1; i >= 0; i--)
937+
for (i = nints - 1; i > 0; i--)
928938
{
939+
diff = ints[i] - ints[i - 1] - 1;
940+
codeword |= diff;
929941
codeword <<= bits;
930-
codeword |= diffs[i];
931942
}
943+
diff = ints[0] - base - 1;
944+
codeword |= diff;
932945
}
933946

934947
/* add selector to the codeword, and return */
935-
codeword <<= 4;
936-
codeword |= selector;
948+
codeword |= (uint64) selector << 60;
937949

938950
*num_encoded = nints;
939951
return codeword;
@@ -945,7 +957,7 @@ simple8b_encode(uint64 *ints, int *num_encoded, uint64 base)
945957
static int
946958
simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
947959
{
948-
int selector = codeword & 0x0f;
960+
int selector = (codeword >> 60);
949961
int nints = simple8b_modes[selector].num_ints;
950962
uint64 bits = simple8b_modes[selector].bits_per_int;
951963
uint64 mask = (UINT64CONST(1) << bits) - 1;
@@ -954,8 +966,6 @@ simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
954966
if (codeword == EMPTY_CODEWORD)
955967
return 0;
956968

957-
codeword >>= 4; /* shift out the selector */
958-
959969
prev_value = base;
960970
for (int i = 0; i < nints; i++)
961971
{
@@ -976,15 +986,13 @@ simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
976986
static bool
977987
simple8b_contains(uint64 codeword, uint64 key, uint64 base)
978988
{
979-
int selector = codeword & 0x0f;
989+
int selector = (codeword >> 60);
980990
int nints = simple8b_modes[selector].num_ints;
981991
int bits = simple8b_modes[selector].bits_per_int;
982992

983993
if (codeword == EMPTY_CODEWORD)
984994
return false;
985995

986-
codeword >>= 4; /* shift out the selector */
987-
988996
if (bits == 0)
989997
{
990998
/* Special handling for 0-bit cases. */

src/test/modules/test_integerset/test_integerset.c

+11-6
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ test_huge_distances(void)
539539
val = 0;
540540
values[num_values++] = val;
541541

542+
/* Test differences on both sides of the 2^60 boundary. */
542543
val += UINT64CONST(1152921504606846976) - 1; /* 2^60 - 1 */
543544
values[num_values++] = val;
544545

@@ -563,24 +564,28 @@ test_huge_distances(void)
563564
val += UINT64CONST(1152921504606846976) + 1; /* 2^60 + 1 */
564565
values[num_values++] = val;
565566

566-
val += UINT64CONST(1152921504606846976); /* 2^60 */
567+
val += UINT64CONST(1152921504606846976) + 2; /* 2^60 + 2 */
567568
values[num_values++] = val;
568569

569-
/* we're now very close to 2^64, so can't add large values anymore */
570+
val += UINT64CONST(1152921504606846976) + 2; /* 2^60 + 2 */
571+
values[num_values++] = val;
570572

571-
intset = intset_create();
573+
val += UINT64CONST(1152921504606846976); /* 2^60 */
574+
values[num_values++] = val;
572575

573576
/*
574-
* Add many more values to the end, to make sure that all the above values
575-
* get flushed and packed into the tree structure.
577+
* We're now very close to 2^64, so can't add large values anymore. But
578+
* add more smaller values to the end, to make sure that all the above
579+
* values get flushed and packed into the tree structure.
576580
*/
577581
while (num_values < 1000)
578582
{
579583
val += pg_lrand48();
580584
values[num_values++] = val;
581585
}
582586

583-
/* Add these numbers to the set */
587+
/* Create an IntegerSet using these values */
588+
intset = intset_create();
584589
for (int i = 0; i < num_values; i++)
585590
intset_add_member(intset, values[i]);
586591

0 commit comments

Comments
 (0)