|
19 | 19 |
|
20 | 20 | #include "access/toasterapi.h"
|
21 | 21 | #include "access/toast_compression.h"
|
| 22 | +#include "access/generic_toaster.h" |
22 | 23 | #include "common/pg_lzcompress.h"
|
23 | 24 | #include "fmgr.h"
|
24 | 25 | #include "utils/builtins.h"
|
@@ -247,6 +248,362 @@ lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength)
|
247 | 248 | #endif
|
248 | 249 | }
|
249 | 250 |
|
| 251 | +/* ---------- |
| 252 | + * pglz_decompress_state - |
| 253 | + * |
| 254 | + * Decompresses source into dest. Returns the number of bytes |
| 255 | + * decompressed into the destination buffer, or -1 if the |
| 256 | + * compressed data is corrupted. |
| 257 | + * |
| 258 | + * If check_complete is true, the data is considered corrupted |
| 259 | + * if we don't exactly fill the destination buffer or don't consume |
| | + * every source byte. Callers that |
| 260 | + * are extracting a slice typically can't apply this check. |
| | + * |
| | + * The decoder can be suspended and resumed through pstate: |
| | + * |
| | + * - pstate == NULL: nothing is loaded or saved, and *slen is left |
| | + * untouched. |
| | + * - *pstate == NULL: decoding starts from scratch; before a normal |
| | + * return a pglz_state is palloc'd and stored into *pstate. |
| | + * - *pstate != NULL: the control byte, the control bit counter and |
| | + * any unfinished match (len/off) are restored from it first. |
| | + * |
| | + * On entry *slen is the number of bytes available at source. When |
| | + * pstate is not NULL, *slen is overwritten on return with the number |
| | + * of source bytes consumed by this call (0 if the call only emitted |
| | + * the rest of an unfinished match and that filled dest). |
| | + * |
| | + * An unfinished match is completed by copying from dest[-off], i.e. |
| | + * from memory in front of dest, so on a resumed call dest has to point |
| | + * right behind the output produced so far. The offset kept in the |
| | + * state is the one left over by the overlap-doubling loop below, which |
| | + * may be a multiple of the offset encoded in the tag. |
| | + * |
| | + * last_cource_chunk (sic) says whether source holds the final part of |
| | + * the compressed input. Only then is running past the end of source |
| | + * while reading a match tag reported as corruption. |
| | + * |
| | + * NOTE(review): when last_cource_chunk is false and a match tag |
| | + * straddles the end of source, the tag bytes are still read from beyond |
| | + * source + *slen and used for the copy. Presumably callers guarantee |
| | + * that those bytes are readable and valid - confirm. |
| | + * |
| | + * NOTE(review): the check_complete test is skipped on the early return |
| | + * taken when an unfinished match fills dest. |
| 261 | + * ---------- |
| 262 | + */ |
| 263 | +int32 |
| 264 | +pglz_decompress_state(const char *source, int32 *slen, char *dest, |
| 265 | + int32 dlen, bool check_complete, bool last_cource_chunk, |
| 266 | + void **pstate) |
| 267 | +{ |
| 268 | + pglz_state *state = pstate ? *pstate : NULL; |
| 269 | + const unsigned char *sp = (const unsigned char *) source; |
| 270 | + const unsigned char *srcend = sp + *slen; |
| 271 | + unsigned char *dp = (unsigned char *) dest; |
| 272 | + unsigned char *destend = dp + dlen; |
| 273 | + unsigned char ctrl; |
| 274 | + int ctrlc; |
| 275 | + int32 len; |
| 276 | + int32 remlen; |
| 277 | + int32 off; |
| 278 | + |
| 279 | + if (state) |
| 280 | + { |
| 281 | + ctrl = state->ctrl; |
| 282 | + ctrlc = state->ctrlc; |
| 283 | + |
| 284 | + if (state->len) |
| 285 | + { |
| 286 | + int32 copylen; |
| 287 | + |
| 288 | + len = state->len; |
| 289 | + off = state->off; |
| 290 | + |
| 291 | + copylen = Min(len, destend - dp); |
| 292 | + remlen = len - copylen; |
| 293 | + while (copylen--) |
| 294 | + { |
| 295 | + *dp = dp[-off]; |
| 296 | + dp++; |
| 297 | + } |
| 298 | + |
| 299 | + if (dp >= destend) |
| 300 | + { |
| 301 | + state->len = remlen; |
| 302 | + *slen = 0; |
| 303 | + return (char *) dp - dest; |
| 304 | + } |
| 305 | + |
| 306 | + Assert(remlen == 0); |
| 307 | + } |
| 308 | + |
| 309 | + remlen = 0; |
| 310 | + off = 0; |
| 311 | + |
| 312 | + if (ctrlc < 8 && sp < srcend && dp < destend) |
| 313 | + goto ctrl_loop; |
| 314 | + } |
| 315 | + else |
| 316 | + { |
| 317 | + ctrl = 0; |
| 318 | + ctrlc = 8; |
| 319 | + remlen = 0; |
| 320 | + off = 0; |
| 321 | + } |
| 322 | + |
| 323 | + while (sp < srcend && dp < destend) |
| 324 | + { |
| 325 | + /* |
| 326 | + * Read one control byte and process the next 8 items (or as many as |
| 327 | + * remain in the compressed input). |
| 328 | + */ |
| 329 | + ctrl = *sp++; |
| 330 | + |
| 331 | + for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) |
| 332 | + { |
| 333 | +ctrl_loop: |
| 334 | + if (ctrl & 1) |
| 335 | + { |
| 336 | + int32 copylen; |
| 337 | + |
| 338 | + /* |
| 339 | + * Set control bit means we must read a match tag. The match |
| 340 | + * is coded with two bytes. First byte uses lower nibble to |
| 341 | + * code length - 3. Higher nibble contains upper 4 bits of the |
| 342 | + * offset. The next following byte contains the lower 8 bits |
| 343 | + * of the offset. If the length is coded as 18, another |
| 344 | + * extension tag byte tells how much longer the match really |
| 345 | + * was (0-255). |
| 346 | + */ |
| 347 | + len = (sp[0] & 0x0f) + 3; |
| 348 | + off = ((sp[0] & 0xf0) << 4) | sp[1]; |
| 349 | + sp += 2; |
| 350 | + if (len == 18) |
| 351 | + len += *sp++; |
| 352 | + |
| 353 | + /* |
| 354 | + * Check for corrupt data: if we fell off the end of the |
| 355 | + * source, or if we obtained off = 0, we have problems. (We |
| 356 | + * must check this, else we risk an infinite loop below in the |
| 357 | + * face of corrupt data.) |
| | + * Falling off the end is only treated as an error for the |
| | + * last source chunk. |
| 358 | + */ |
| 359 | + if (unlikely((sp > srcend && last_cource_chunk) || off == 0)) |
| 360 | + return -1; |
| 361 | + |
| 362 | + /* |
| 363 | + * Don't emit more data than requested. |
| | + * Whatever does not fit is remembered in remlen and, if a |
| | + * state is in use, saved there so a later call can finish |
| | + * the match. |
| 364 | + */ |
| 365 | + copylen = Min(len, destend - dp); |
| 366 | + remlen = len - copylen; |
| 367 | + |
| 368 | + /* |
| 369 | + * Now we copy the bytes specified by the tag from OUTPUT to |
| 370 | + * OUTPUT (copy len bytes from dp - off to dp). The copied |
| 371 | + * areas could overlap; to prevent possible uncertainty, we |
| 372 | + * copy only non-overlapping regions. |
| 373 | + */ |
| 374 | + while (off < copylen) |
| 375 | + { |
| 376 | + /* |
| 377 | + * We can safely copy "off" bytes since that clearly |
| 378 | + * results in non-overlapping source and destination. |
| 379 | + */ |
| 380 | + memcpy(dp, dp - off, off); |
| 381 | + copylen -= off; |
| 382 | + dp += off; |
| 383 | + |
| 384 | + /*---------- |
| 385 | + * This bit is less obvious: we can double "off" after |
| 386 | + * each such step. Consider this raw input: |
| 387 | + * 112341234123412341234 |
| 388 | + * This will be encoded as 5 literal bytes "11234" and |
| 389 | + * then a match tag with length 16 and offset 4. After |
| 390 | + * memcpy'ing the first 4 bytes, we will have emitted |
| 391 | + * 112341234 |
| 392 | + * so we can double "off" to 8, then after the next step |
| 393 | + * we have emitted |
| 394 | + * 11234123412341234 |
| 395 | + * Then we can double "off" again, after which it is more |
| 396 | + * than the remaining "len" so we fall out of this loop |
| 397 | + * and finish with a non-overlapping copy of the |
| 398 | + * remainder. In general, a match tag with off < len |
| 399 | + * implies that the decoded data has a repeat length of |
| 400 | + * "off". We can handle 1, 2, 4, etc repetitions of the |
| 401 | + * repeated string per memcpy until we get to a situation |
| 402 | + * where the final copy step is non-overlapping. |
| 403 | + * |
| 404 | + * (Another way to understand this is that we are keeping |
| 405 | + * the copy source point dp - off the same throughout.) |
| 406 | + *---------- |
| 407 | + */ |
| 408 | + off += off; |
| 409 | + } |
| 410 | + memcpy(dp, dp - off, copylen); |
| 411 | + dp += copylen; |
| 412 | + } |
| 413 | + else |
| 414 | + { |
| 415 | + /* |
| 416 | + * An unset control bit means LITERAL BYTE. So we just copy |
| 417 | + * one from INPUT to OUTPUT. |
| 418 | + */ |
| 419 | + *dp++ = *sp++; |
| 420 | + } |
| 421 | + |
| 422 | + /* |
| 423 | + * Advance the control bit |
| 424 | + */ |
| 425 | + ctrl >>= 1; |
| 426 | + } |
| 427 | + } |
| 428 | + |
| 429 | + /* |
| 430 | + * If requested, check we decompressed the right amount. |
| | + * That means dest exactly filled and source fully consumed. |
| 431 | + */ |
| 432 | + if (check_complete && (dp != destend || sp != srcend)) |
| 433 | + return -1; |
| 434 | + |
| 435 | + if (pstate) |
| 436 | + { |
| 437 | + if (!state) |
| 438 | + *pstate = state = palloc(sizeof(*state)); |
| 439 | + |
| 440 | + state->ctrl = ctrl; |
| 441 | + state->ctrlc = ctrlc; |
| 442 | + state->len = remlen; |
| 443 | + state->off = off; |
| 444 | + |
| 445 | + *slen = (const char *) sp - source; |
| 446 | + } |
| 447 | + |
| 448 | + /* |
| 449 | + * That's it. |
| 450 | + */ |
| 451 | + return (char *) dp - dest; |
| 452 | +} |
| 453 | + |
| 454 | +#if 0 |
| 455 | +/* ---------- |
| 456 | + * pglz_decompress_iterate - |
| 457 | + * |
| 458 | + * This function is based on pglz_decompress(), with these additional |
| 459 | + * requirements: |
| 460 | + * |
| 461 | + * 1. We need to save the current control byte and byte position for the |
| 462 | + * caller's next iteration. |
| 463 | + * |
| 464 | + * 2. In pglz_decompress(), we can assume we have all the source bytes |
| 465 | + * available. This is not the case when we decompress one chunk at a |
| 466 | + * time, so we have to make sure that we only read bytes available in the |
| 467 | + * current chunk. |
| | + * |
| | + * Input is taken starting at source->position; the main loop stops at |
| | + * source->limit, or four bytes before it unless source->limit equals |
| | + * source->capacity. Output is appended at dest->limit and is bounded by |
| | + * destend, which is clamped to dest->capacity. |
| | + * |
| | + * On a normal return source->position and dest->limit are updated. The |
| | + * control byte, the control bit counter and any match that did not fit |
| | + * (len/off) are kept in iter for the next call. If a match left over |
| | + * from the previous call already fills the output, only dest->limit and |
| | + * iter->len are updated before returning. |
| | + * |
| | + * This function is currently compiled out by the surrounding #if 0. |
| | + * |
| | + * NOTE(review): destend is declared as const char * but is mixed with |
| | + * unsigned char * values in comparisons, an assignment and pointer |
| | + * subtraction; that probably needs casts before the code can be |
| | + * re-enabled - confirm. |
| | + * |
| | + * NOTE(review): off == 0 is not rejected here. With off == 0 and a |
| | + * positive copylen the doubling loop (off += off) cannot make progress. |
| 468 | + * ---------- |
| 469 | + */ |
| 470 | +void |
| 471 | +pglz_decompress_iterate(ToastBuffer *source, ToastBuffer *dest, |
| 472 | + DetoastIterator iter, const char *destend) |
| 473 | +{ |
| 474 | + const unsigned char *sp; |
| 475 | + const unsigned char *srcend; |
| 476 | + unsigned char *dp; |
| 477 | + |
| 478 | + /* |
| 479 | + * In the while loop, sp may be incremented such that it points beyond |
| 480 | + * srcend. To guard against reading beyond the end of the current chunk, |
| 481 | + * we set srcend such that we exit the loop when we are within four bytes |
| 482 | + * of the end of the current chunk. When source->limit reaches |
| 483 | + * source->capacity, we are decompressing the last chunk, so we can (and |
| 484 | + * need to) read every byte. |
| 485 | + */ |
| 486 | + srcend = (const unsigned char *) |
| 487 | + (source->limit == source->capacity ? source->limit : (source->limit - 4)); |
| 488 | + sp = (const unsigned char *) source->position; |
| 489 | + dp = (unsigned char *) dest->limit; |
| 490 | + if (destend > (unsigned char *) dest->capacity) |
| 491 | + destend = (unsigned char *) dest->capacity; |
| 492 | + |
| 493 | + if (iter->len) |
| 494 | + { |
| 495 | + int32 len = iter->len; |
| 496 | + int32 off = iter->off; |
| 497 | + int32 copylen = Min(len, destend - dp); |
| 498 | + int32 remlen = len - copylen; |
| 499 | + |
| 500 | + while (copylen--) |
| 501 | + { |
| 502 | + *dp = dp[-off]; |
| 503 | + dp++; |
| 504 | + } |
| 505 | + |
| 506 | + iter->len = remlen; |
| 507 | + |
| 508 | + if (dp >= destend) |
| 509 | + { |
| 510 | + dest->limit = (char *) dp; |
| 511 | + return; |
| 512 | + } |
| 513 | + |
| 514 | + Assert(remlen == 0); |
| 515 | + } |
| 516 | + |
| 517 | + while (sp < srcend && dp < destend) |
| 518 | + { |
| 519 | + /* |
| 520 | + * Read one control byte and process the next 8 items (or as many as |
| 521 | + * remain in the compressed input). |
| | + * A control byte saved in iter by an earlier pass is reused |
| | + * as long as iter->ctrlc differs from INVALID_CTRLC. |
| 522 | + */ |
| 523 | + unsigned char ctrl; |
| 524 | + int ctrlc; |
| 525 | + |
| 526 | + if (iter->ctrlc != INVALID_CTRLC) |
| 527 | + { |
| 528 | + ctrl = iter->ctrl; |
| 529 | + ctrlc = iter->ctrlc; |
| 530 | + } |
| 531 | + else |
| 532 | + { |
| 533 | + ctrl = *sp++; |
| 534 | + ctrlc = 0; |
| 535 | + } |
| 536 | + |
| 537 | + for (; ctrlc < INVALID_CTRLC && sp < srcend && dp < destend; ctrlc++) |
| 538 | + { |
| 539 | + |
| 540 | + if (ctrl & 1) |
| 541 | + { |
| 542 | + /* |
| 543 | + * Set control bit means we must read a match tag. The match |
| 544 | + * is coded with two bytes. First byte uses lower nibble to |
| 545 | + * code length - 3. Higher nibble contains upper 4 bits of the |
| 546 | + * offset. The next following byte contains the lower 8 bits |
| 547 | + * of the offset. If the length is coded as 18, another |
| 548 | + * extension tag byte tells how much longer the match really |
| 549 | + * was (0-255). |
| 550 | + */ |
| 551 | + int32 len; |
| 552 | + int32 off; |
| 553 | + int32 copylen; |
| 554 | + |
| 555 | + len = (sp[0] & 0x0f) + 3; |
| 556 | + off = ((sp[0] & 0xf0) << 4) | sp[1]; |
| 557 | + sp += 2; |
| 558 | + if (len == 18) |
| 559 | + len += *sp++; |
| 560 | + |
| 561 | + /* |
| 562 | + * Now we copy the bytes specified by the tag from OUTPUT to |
| 563 | + * OUTPUT (copy len bytes from dp - off to dp). The copied |
| 564 | + * areas could overlap; to prevent possible uncertainty, we |
| 565 | + * copy only non-overlapping regions. |
| | + * The part of the match that does not fit before destend is |
| | + * recorded in iter->len. |
| 566 | + */ |
| 567 | + copylen = Min(len, destend - dp); |
| 568 | + iter->len = len - copylen; |
| 569 | + |
| 570 | + while (off < copylen) |
| 571 | + { |
| 572 | + /* see comments in common/pg_lzcompress.c */ |
| 573 | + memcpy(dp, dp - off, off); |
| 574 | + copylen -= off; |
| 575 | + dp += off; |
| 576 | + off += off; |
| 577 | + } |
| 578 | + memcpy(dp, dp - off, copylen); |
| 579 | + dp += copylen; |
| 580 | + |
| 581 | + iter->off = off; |
| 582 | + } |
| 583 | + else |
| 584 | + { |
| 585 | + /* |
| 586 | + * An unset control bit means LITERAL BYTE. So we just copy |
| 587 | + * one from INPUT to OUTPUT. |
| 588 | + */ |
| 589 | + *dp++ = *sp++; |
| 590 | + } |
| 591 | + |
| 592 | + /* |
| 593 | + * Advance the control bit |
| 594 | + */ |
| 595 | + ctrl >>= 1; |
| 596 | + } |
| 597 | + |
| 598 | + iter->ctrlc = ctrlc; |
| 599 | + iter->ctrl = ctrl; |
| 600 | + } |
| 601 | + |
| 602 | + source->position = (char *) sp; |
| 603 | + dest->limit = (char *) dp; |
| 604 | +} |
| 605 | +#endif |
| 606 | + |
250 | 607 | /*
|
251 | 608 | * Extract compression ID from a varlena.
|
252 | 609 | *
|
|
0 commit comments