Skip to content

Commit f481d28

Browse files
committed
Check default partitions constraints while descending
The partition tuple routing code assumes that the partition chosen while descending the partition hierarchy is always the correct one. This is true except when the partition is the default partition and another partition has been added concurrently: the partition constraint changes and we don't recheck it. This can lead to tuples mistakenly being added to the default partition that should have been rejected.

Fix by rechecking the default partition constraint while descending the hierarchy.

An isolation test based on the reproduction steps described by Hao Wu (with tweaks for extra coverage) is included.

Backpatch to 12, where this bug came in with 898e5e3.

Reported by: Hao Wu <[email protected]>
Author: Amit Langote <[email protected]>
Author: Álvaro Herrera <[email protected]>
Discussion: https://postgr.es/m/CA+HiwqFqBmcSSap4sFnCBUEL_VfOMmEKaQ3gwUhyfa4c7J_-nA@mail.gmail.com
Discussion: https://postgr.es/m/DM5PR0501MB3910E97A9EDFB4C775CF3D75A42F0@DM5PR0501MB3910.namprd05.prod.outlook.com
1 parent c9ae5cb commit f481d28

File tree

4 files changed

+195
-25
lines changed

4 files changed

+195
-25
lines changed

src/backend/executor/execPartition.c

+102-25
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,11 @@
5151
* PartitionDispatchData->indexes for details on how this array is
5252
* indexed.
5353
*
54+
* nonleaf_partitions
55+
* Array of 'max_dispatch' elements containing pointers to fake
56+
* ResultRelInfo objects for nonleaf partitions, useful for checking
57+
* the partition constraint.
58+
*
5459
* num_dispatch
5560
* The current number of items stored in the 'partition_dispatch_info'
5661
* array. Also serves as the index of the next free array element for
@@ -89,6 +94,7 @@ struct PartitionTupleRouting
8994
{
9095
Relation partition_root;
9196
PartitionDispatch *partition_dispatch_info;
97+
ResultRelInfo **nonleaf_partitions;
9298
int num_dispatch;
9399
int max_dispatch;
94100
ResultRelInfo **partitions;
@@ -280,9 +286,11 @@ ExecFindPartition(ModifyTableState *mtstate,
280286
PartitionDispatch dispatch;
281287
PartitionDesc partdesc;
282288
ExprContext *ecxt = GetPerTupleExprContext(estate);
283-
TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple;
289+
TupleTableSlot *ecxt_scantuple_saved = ecxt->ecxt_scantuple;
290+
TupleTableSlot *rootslot = slot;
284291
TupleTableSlot *myslot = NULL;
285292
MemoryContext oldcxt;
293+
ResultRelInfo *rri = NULL;
286294

287295
/* use per-tuple context here to avoid leaking memory */
288296
oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
@@ -296,27 +304,15 @@ ExecFindPartition(ModifyTableState *mtstate,
296304

297305
/* start with the root partitioned table */
298306
dispatch = pd[0];
299-
while (true)
307+
while (dispatch != NULL)
300308
{
301-
AttrMap *map = dispatch->tupmap;
302309
int partidx = -1;
303310

304311
CHECK_FOR_INTERRUPTS();
305312

306313
rel = dispatch->reldesc;
307314
partdesc = dispatch->partdesc;
308315

309-
/*
310-
* Convert the tuple to this parent's layout, if different from the
311-
* current relation.
312-
*/
313-
myslot = dispatch->tupslot;
314-
if (myslot != NULL)
315-
{
316-
Assert(map != NULL);
317-
slot = execute_attr_map_slot(map, slot, myslot);
318-
}
319-
320316
/*
321317
* Extract partition key from tuple. Expression evaluation machinery
322318
* that FormPartitionKeyDatum() invokes expects ecxt_scantuple to
@@ -352,11 +348,9 @@ ExecFindPartition(ModifyTableState *mtstate,
352348

353349
if (partdesc->is_leaf[partidx])
354350
{
355-
ResultRelInfo *rri;
356-
357351
/*
358-
* Look to see if we've already got a ResultRelInfo for this
359-
* partition.
352+
* We've reached the leaf -- hurray, we're done. Look to see if
353+
* we've already got a ResultRelInfo for this partition.
360354
*/
361355
if (likely(dispatch->indexes[partidx] >= 0))
362356
{
@@ -400,14 +394,10 @@ ExecFindPartition(ModifyTableState *mtstate,
400394
dispatch,
401395
rootResultRelInfo, partidx);
402396
}
397+
Assert(rri != NULL);
403398

404-
/* Release the tuple in the lowest parent's dedicated slot. */
405-
if (slot == myslot)
406-
ExecClearTuple(myslot);
407-
408-
MemoryContextSwitchTo(oldcxt);
409-
ecxt->ecxt_scantuple = ecxt_scantuple_old;
410-
return rri;
399+
/* Signal to terminate the loop */
400+
dispatch = NULL;
411401
}
412402
else
413403
{
@@ -419,6 +409,8 @@ ExecFindPartition(ModifyTableState *mtstate,
419409
/* Already built. */
420410
Assert(dispatch->indexes[partidx] < proute->num_dispatch);
421411

412+
rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
413+
422414
/*
423415
* Move down to the next partition level and search again
424416
* until we find a leaf partition that matches this tuple
@@ -440,10 +432,75 @@ ExecFindPartition(ModifyTableState *mtstate,
440432
dispatch, partidx);
441433
Assert(dispatch->indexes[partidx] >= 0 &&
442434
dispatch->indexes[partidx] < proute->num_dispatch);
435+
436+
rri = proute->nonleaf_partitions[dispatch->indexes[partidx]];
443437
dispatch = subdispatch;
444438
}
439+
440+
/*
441+
* Convert the tuple to the new parent's layout, if different from
442+
* the previous parent.
443+
*/
444+
if (dispatch->tupslot)
445+
{
446+
AttrMap *map = dispatch->tupmap;
447+
TupleTableSlot *tempslot = myslot;
448+
449+
myslot = dispatch->tupslot;
450+
slot = execute_attr_map_slot(map, slot, myslot);
451+
452+
if (tempslot != NULL)
453+
ExecClearTuple(tempslot);
454+
}
455+
}
456+
457+
/*
458+
* If this partition is the default one, we must check its partition
459+
* constraint now, which may have changed concurrently due to
460+
* partitions being added to the parent.
461+
*
462+
* (We do this here, and do not rely on ExecInsert doing it, because
463+
* we don't want to miss doing it for non-leaf partitions.)
464+
*/
465+
if (partidx == partdesc->boundinfo->default_index)
466+
{
467+
PartitionRoutingInfo *partrouteinfo = rri->ri_PartitionInfo;
468+
469+
/*
470+
* The tuple must match the partition's layout for the constraint
471+
* expression to be evaluated successfully. If the partition is
472+
* sub-partitioned, that would already be the case due to the code
473+
* above, but for a leaf partition the tuple still matches the
474+
* parent's layout.
475+
*
476+
* Note that we have a map to convert from root to current
477+
* partition, but not from immediate parent to current partition.
478+
* So if we have to convert, do it from the root slot; if not, use
479+
* the root slot as-is.
480+
*/
481+
if (partrouteinfo)
482+
{
483+
TupleConversionMap *map = partrouteinfo->pi_RootToPartitionMap;
484+
485+
if (map)
486+
slot = execute_attr_map_slot(map->attrMap, rootslot,
487+
partrouteinfo->pi_PartitionTupleSlot);
488+
else
489+
slot = rootslot;
490+
}
491+
492+
ExecPartitionCheck(rri, slot, estate, true);
445493
}
446494
}
495+
496+
/* Release the tuple in the lowest parent's dedicated slot. */
497+
if (myslot != NULL)
498+
ExecClearTuple(myslot);
499+
/* and restore ecxt's scantuple */
500+
ecxt->ecxt_scantuple = ecxt_scantuple_saved;
501+
MemoryContextSwitchTo(oldcxt);
502+
503+
return rri;
447504
}
448505

449506
/*
@@ -1060,17 +1117,37 @@ ExecInitPartitionDispatchInfo(EState *estate,
10601117
proute->max_dispatch = 4;
10611118
proute->partition_dispatch_info = (PartitionDispatch *)
10621119
palloc(sizeof(PartitionDispatch) * proute->max_dispatch);
1120+
proute->nonleaf_partitions = (ResultRelInfo **)
1121+
palloc(sizeof(ResultRelInfo *) * proute->max_dispatch);
10631122
}
10641123
else
10651124
{
10661125
proute->max_dispatch *= 2;
10671126
proute->partition_dispatch_info = (PartitionDispatch *)
10681127
repalloc(proute->partition_dispatch_info,
10691128
sizeof(PartitionDispatch) * proute->max_dispatch);
1129+
proute->nonleaf_partitions = (ResultRelInfo **)
1130+
repalloc(proute->nonleaf_partitions,
1131+
sizeof(ResultRelInfo *) * proute->max_dispatch);
10701132
}
10711133
}
10721134
proute->partition_dispatch_info[dispatchidx] = pd;
10731135

1136+
/*
1137+
* If setting up a PartitionDispatch for a sub-partitioned table, we may
1138+
* also need a minimally valid ResultRelInfo for checking the partition
1139+
* constraint later; set that up now.
1140+
*/
1141+
if (parent_pd)
1142+
{
1143+
ResultRelInfo *rri = makeNode(ResultRelInfo);
1144+
1145+
InitResultRelInfo(rri, rel, 1, proute->partition_root, 0);
1146+
proute->nonleaf_partitions[dispatchidx] = rri;
1147+
}
1148+
else
1149+
proute->nonleaf_partitions[dispatchidx] = NULL;
1150+
10741151
/*
10751152
* Finally, if setting up a PartitionDispatch for a sub-partitioned table,
10761153
* install a downlink in the parent to allow quick descent.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
Parsed test spec with 2 sessions
2+
3+
starting permutation: s1b s1a s2b s2i s1c s2c s2s
4+
step s1b: begin;
5+
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
6+
step s2b: begin;
7+
step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
8+
step s1c: commit;
9+
step s2i: <... completed>
10+
error in steps s1c s2i: ERROR: new row for relation "tpart_default" violates partition constraint
11+
step s2c: commit;
12+
step s2s: select tableoid::regclass, * from tpart;
13+
tableoid i j
14+
15+
tpart_2 110 xxx
16+
tpart_2 120 yyy
17+
tpart_2 150 zzz
18+
19+
starting permutation: s1b s1a s2b s2i2 s1c s2c s2s
20+
step s1b: begin;
21+
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200);
22+
step s2b: begin;
23+
step s2i2: insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); <waiting ...>
24+
step s1c: commit;
25+
step s2i2: <... completed>
26+
error in steps s1c s2i2: ERROR: new row for relation "tpart_default" violates partition constraint
27+
step s2c: commit;
28+
step s2s: select tableoid::regclass, * from tpart;
29+
tableoid i j
30+
31+
tpart_2 110 xxx
32+
tpart_2 120 yyy
33+
tpart_2 150 zzz
34+
35+
starting permutation: s1b s2b s2i s1a s2c s1c s2s
36+
step s1b: begin;
37+
step s2b: begin;
38+
step s2i: insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
39+
step s1a: alter table tpart attach partition tpart_2 for values from (100) to (200); <waiting ...>
40+
step s2c: commit;
41+
step s1a: <... completed>
42+
error in steps s2c s1a: ERROR: updated partition constraint for default partition "tpart_default_default" would be violated by some row
43+
step s1c: commit;
44+
step s2s: select tableoid::regclass, * from tpart;
45+
tableoid i j
46+
47+
tpart_default_default110 xxx
48+
tpart_default_default120 yyy
49+
tpart_default_default150 zzz

src/test/isolation/isolation_schedule

+1
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,7 @@ test: vacuum-skip-locked
8181
test: predicate-hash
8282
test: predicate-gist
8383
test: predicate-gin
84+
test: partition-concurrent-attach
8485
test: partition-key-update-1
8586
test: partition-key-update-2
8687
test: partition-key-update-3
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
# Verify that default partition constraint is enforced correctly
2+
# in light of partitions being added concurrently to its parent
3+
setup {
4+
drop table if exists tpart;
5+
create table tpart(i int, j text) partition by range(i);
6+
create table tpart_1(like tpart);
7+
create table tpart_2(like tpart);
8+
create table tpart_default (a int, j text, i int) partition by list (j);
9+
create table tpart_default_default (a int, i int, b int, j text);
10+
alter table tpart_default_default drop b;
11+
alter table tpart_default attach partition tpart_default_default default;
12+
alter table tpart_default drop a;
13+
alter table tpart attach partition tpart_default default;
14+
alter table tpart attach partition tpart_1 for values from(0) to (100);
15+
insert into tpart_2 values (110,'xxx'), (120, 'yyy'), (150, 'zzz');
16+
}
17+
18+
session "s1"
19+
step "s1b" { begin; }
20+
step "s1a" { alter table tpart attach partition tpart_2 for values from (100) to (200); }
21+
step "s1c" { commit; }
22+
23+
session "s2"
24+
step "s2b" { begin; }
25+
step "s2i" { insert into tpart values (110,'xxx'), (120, 'yyy'), (150, 'zzz'); }
26+
step "s2i2" { insert into tpart_default (i, j) values (110, 'xxx'), (120, 'yyy'), (150, 'zzz'); }
27+
step "s2c" { commit; }
28+
step "s2s" { select tableoid::regclass, * from tpart; }
29+
30+
teardown { drop table tpart; }
31+
32+
# insert into tpart by s2 which routes to tpart_default due to not seeing
33+
# concurrently added tpart_2 should fail, because the partition constraint
34+
# of tpart_default would have changed due to tpart_2 having been added
35+
permutation "s1b" "s1a" "s2b" "s2i" "s1c" "s2c" "s2s"
36+
37+
# similar to above, but now insert into sub-partitioned tpart_default
38+
permutation "s1b" "s1a" "s2b" "s2i2" "s1c" "s2c" "s2s"
39+
40+
# reverse: now the insert into tpart_default by s2 occurs first followed by
41+
# attach in s1, which should fail when it scans the leaf default partition
42+
# and finds the violating rows
43+
permutation "s1b" "s2b" "s2i" "s1a" "s2c" "s1c" "s2s"

0 commit comments

Comments
 (0)