Commit 924bcf4

Create an infrastructure for parallel computation in PostgreSQL.
This does four basic things.  First, it provides convenience routines to
coordinate the startup and shutdown of parallel workers.  Second, it
synchronizes various pieces of state (e.g. GUCs, combo CID mappings,
transaction snapshot) from the parallel group leader to the worker processes.
Third, it prohibits various operations that would result in unsafe changes to
that state while parallelism is active.  Finally, it propagates events that
would result in an ErrorResponse, NoticeResponse, or NotifyResponse message
being sent to the client from the parallel workers back to the master, from
which they can then be sent on to the client.

Robert Haas, Amit Kapila, Noah Misch, Rushabh Lathia, Jeevan Chalke.
Suggestions and review from Andres Freund, Heikki Linnakangas, Noah Misch,
Simon Riggs, Euler Taveira, and Jim Nasby.
1 parent 669c7d2 commit 924bcf4

File tree

37 files changed  +2499  -47 lines changed

contrib/postgres_fdw/connection.c

+3
@@ -546,6 +546,7 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 
 			switch (event)
 			{
+				case XACT_EVENT_PARALLEL_PRE_COMMIT:
 				case XACT_EVENT_PRE_COMMIT:
 					/* Commit all remote transactions during pre-commit */
 					do_sql_command(entry->conn, "COMMIT TRANSACTION");
@@ -588,11 +589,13 @@ pgfdw_xact_callback(XactEvent event, void *arg)
 						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 						 errmsg("cannot prepare a transaction that modified remote tables")));
 					break;
+				case XACT_EVENT_PARALLEL_COMMIT:
 				case XACT_EVENT_COMMIT:
 				case XACT_EVENT_PREPARE:
 					/* Pre-commit should have closed the open transaction */
 					elog(ERROR, "missed cleaning up connection during pre-commit");
 					break;
+				case XACT_EVENT_PARALLEL_ABORT:
 				case XACT_EVENT_ABORT:
 					/* Assume we might have lost track of prepared statements */
 					entry->have_error = true;
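
The new XACT_EVENT_PARALLEL_* events matter to any extension that registers a
transaction callback: each one should generally be handled like its
non-parallel counterpart, as postgres_fdw does above.  A minimal sketch of
such a callback (my_xact_callback and the _PG_init contents are hypothetical;
RegisterXactCallback() is the standard registration hook in access/xact.h):

    #include "postgres.h"

    #include "access/xact.h"
    #include "fmgr.h"

    PG_MODULE_MAGIC;

    static void
    my_xact_callback(XactEvent event, void *arg)
    {
        switch (event)
        {
            case XACT_EVENT_PARALLEL_PRE_COMMIT:
            case XACT_EVENT_PRE_COMMIT:
                /* Flush pending work while the transaction can still fail. */
                break;
            case XACT_EVENT_PARALLEL_COMMIT:
            case XACT_EVENT_COMMIT:
            case XACT_EVENT_PARALLEL_ABORT:
            case XACT_EVENT_ABORT:
                /* Release external resources; avoid throwing errors here. */
                break;
            default:
                break;
        }
    }

    void
    _PG_init(void)
    {
        RegisterXactCallback(my_xact_callback, NULL);
    }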

src/backend/access/heap/heapam.c

+55
@@ -42,6 +42,7 @@
 #include "access/heapam_xlog.h"
 #include "access/hio.h"
 #include "access/multixact.h"
+#include "access/parallel.h"
 #include "access/relscan.h"
 #include "access/sysattr.h"
 #include "access/transam.h"
@@ -1051,7 +1052,13 @@ relation_open(Oid relationId, LOCKMODE lockmode)
 
 	/* Make note that we've accessed a temporary relation */
 	if (RelationUsesLocalBuffers(r))
+	{
+		if (IsParallelWorker())
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+					 errmsg("cannot access temporary tables during a parallel operation")));
 		MyXactAccessedTempRel = true;
+	}
 
 	pgstat_initstats(r);
 
@@ -1097,7 +1104,13 @@ try_relation_open(Oid relationId, LOCKMODE lockmode)
 
 	/* Make note that we've accessed a temporary relation */
 	if (RelationUsesLocalBuffers(r))
+	{
+		if (IsParallelWorker())
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+					 errmsg("cannot access temporary tables during a parallel operation")));
 		MyXactAccessedTempRel = true;
+	}
 
 	pgstat_initstats(r);
 
@@ -2237,6 +2250,17 @@ static HeapTuple
 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
 					CommandId cid, int options)
 {
+	/*
+	 * For now, parallel operations are required to be strictly read-only.
+	 * Unlike heap_update() and heap_delete(), an insert should never create
+	 * a combo CID, so it might be possible to relax this restriction, but
+	 * not without more thought and testing.
+	 */
+	if (IsInParallelMode())
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+				 errmsg("cannot insert tuples during a parallel operation")));
+
 	if (relation->rd_rel->relhasoids)
 	{
 #ifdef NOT_USED
@@ -2648,6 +2672,16 @@ heap_delete(Relation relation, ItemPointer tid,
 
 	Assert(ItemPointerIsValid(tid));
 
+	/*
+	 * Forbid this during a parallel operation, lest it allocate a combocid.
+	 * Other workers might need that combocid for visibility checks, and we
+	 * have no provision for broadcasting it to them.
+	 */
+	if (IsInParallelMode())
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+				 errmsg("cannot delete tuples during a parallel operation")));
+
 	block = ItemPointerGetBlockNumber(tid);
 	buffer = ReadBuffer(relation, block);
 	page = BufferGetPage(buffer);
@@ -3099,6 +3133,16 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 
 	Assert(ItemPointerIsValid(otid));
 
+	/*
+	 * Forbid this during a parallel operation, lest it allocate a combocid.
+	 * Other workers might need that combocid for visibility checks, and we
+	 * have no provision for broadcasting it to them.
+	 */
+	if (IsInParallelMode())
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+				 errmsg("cannot update tuples during a parallel operation")));
+
 	/*
 	 * Fetch the list of attributes to be checked for HOT update.  This is
 	 * wasted effort if we fail to update or have to put the new tuple on a
@@ -5400,6 +5444,17 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
 	uint32		oldlen;
 	uint32		newlen;
 
+	/*
+	 * For now, parallel operations are required to be strictly read-only.
+	 * Unlike a regular update, this should never create a combo CID, so it
+	 * might be possible to relax this restriction, but not without more
+	 * thought and testing.  It's not clear that it would be useful, anyway.
+	 */
+	if (IsInParallelMode())
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+				 errmsg("cannot update tuples during a parallel operation")));
+
 	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
 	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 	page = (Page) BufferGetPage(buffer);
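
The checks added above all follow one pattern, which third-party C code can
reuse to guard its own parallel-unsafe operations.  A minimal sketch (the
helper name is hypothetical; IsInParallelMode() is declared in access/xact.h,
and the IsParallelWorker() macro used by the temporary-table checks lives in
access/parallel.h):

    #include "postgres.h"

    #include "access/xact.h"

    /*
     * Hypothetical helper: refuse to run while a parallel operation is in
     * progress.  IsInParallelMode() is true in both the leader and its
     * workers for the duration of the parallel operation.
     */
    static void
    forbid_in_parallel_mode(const char *what)
    {
        if (IsInParallelMode())
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
                     errmsg("cannot %s during a parallel operation", what)));
    }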

src/backend/access/transam/Makefile

+1 -1
@@ -12,7 +12,7 @@ subdir = src/backend/access/transam
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = clog.o commit_ts.o multixact.o rmgr.o slru.o subtrans.o \
+OBJS = clog.o commit_ts.o multixact.o parallel.o rmgr.o slru.o subtrans.o \
 	timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \
 	xact.o xlog.o xlogarchive.o xlogfuncs.o \
 	xloginsert.o xlogreader.o xlogutils.o
src/backend/access/transam/README.parallel

+223
@@ -0,0 +1,223 @@
Overview
========

PostgreSQL provides some simple facilities to make writing parallel algorithms
easier.  Using a data structure called a ParallelContext, you can arrange to
launch background worker processes, initialize their state to match that of
the backend which initiated parallelism, communicate with them via dynamic
shared memory, and write reasonably complex code that can run either in the
user backend or in one of the parallel workers without needing to be aware of
where it's running.

The backend which starts a parallel operation (hereafter, the initiating
backend) starts by creating a dynamic shared memory segment which will last
for the lifetime of the parallel operation.  This dynamic shared memory
segment will contain (1) a shm_mq that can be used to transport errors (and
other messages reported via elog/ereport) from the worker back to the
initiating backend; (2) serialized representations of the initiating backend's
private state, so that the worker can synchronize its state with that of the
initiating backend; and (3) any other data structures which a particular user
of the ParallelContext data structure may wish to add for its own purposes.
Once the initiating backend has initialized the dynamic shared memory segment,
it asks the postmaster to launch the appropriate number of parallel workers.
These workers then connect to the dynamic shared memory segment, initialize
their state, and then invoke the appropriate entrypoint, as further detailed
below.
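
To make that concrete, here is a minimal sketch of a worker entrypoint of the
shape CreateParallelContext() expects, assuming the initiating backend stored
a flat struct in the segment's table of contents under a caller-chosen key
(MyParallelArgs, MY_KEY_ARGS, and my_parallel_worker_main are hypothetical
names; the matching leader-side code appears under Coding Conventions below):

    #include "postgres.h"

    #include "access/parallel.h"
    #include "storage/block.h"
    #include "storage/dsm.h"
    #include "storage/shm_toc.h"

    /* Hypothetical work description placed in the DSM by the leader. */
    typedef struct MyParallelArgs
    {
        BlockNumber start_block;
        BlockNumber end_block;
    } MyParallelArgs;

    #define MY_KEY_ARGS 0

    void
    my_parallel_worker_main(dsm_segment *seg, shm_toc *toc)
    {
        MyParallelArgs *args;

        /*
         * By the time this runs, the worker's state (GUCs, snapshots, user
         * ID, and so on; see State Sharing below) has been synchronized
         * with the initiating backend, so ordinary read-only code can
         * simply execute here.
         */
        args = shm_toc_lookup(toc, MY_KEY_ARGS);

        /* ... do read-only work on blocks [start_block, end_block) ... */
    }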

Error Reporting
===============

When started, each parallel worker begins by attaching the dynamic shared
memory segment and locating the shm_mq to be used for error reporting; it
redirects all of its protocol messages to this shm_mq.  Prior to this point,
any failure of the background worker will not be reported to the initiating
backend; from the point of view of the initiating backend, the worker simply
failed to start.  The initiating backend must anyway be prepared to cope
with fewer parallel workers than it originally requested, so catering to
this case imposes no additional burden.

Whenever a new message (or partial message; very large messages may wrap) is
sent to the error-reporting queue, PROCSIG_PARALLEL_MESSAGE is sent to the
initiating backend.  This causes the next CHECK_FOR_INTERRUPTS() in the
initiating backend to read and rethrow the message.  For the most part, this
makes error reporting in parallel mode "just work".  Of course, to work
properly, it is important that the code the initiating backend is executing
call CHECK_FOR_INTERRUPTS() regularly and avoid blocking interrupt processing
for long periods of time, but those are good things to do anyway.

(A currently-unsolved problem is that some messages may get written to the
system log twice, once in the backend where the report was originally
generated, and again when the initiating backend rethrows the message.  If
we decide to suppress one of these reports, it should probably be the second
one; otherwise, if the worker is for some reason unable to propagate the
message back to the initiating backend, the message will be lost altogether.)
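
As an illustration of the advice above about calling CHECK_FOR_INTERRUPTS()
regularly, a leader waiting for worker results might structure its wait loop
roughly like this (a simplified sketch; the completion test is hypothetical,
and the exact WaitLatch() flags will depend on context):

    for (;;)
    {
        /* Rethrows any error a worker sent via PROCSIG_PARALLEL_MESSAGE. */
        CHECK_FOR_INTERRUPTS();

        if (all_work_done)      /* hypothetical completion test */
            break;

        /* Sleep until a worker signals us, then check again. */
        WaitLatch(MyLatch, WL_LATCH_SET, 0);
        ResetLatch(MyLatch);
    }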

State Sharing
=============

It's possible to write C code which works correctly without parallelism, but
which fails when parallelism is used.  No parallel infrastructure can
completely eliminate this problem, because any global variable is a risk.
There's no general mechanism for ensuring that every global variable in the
worker will have the same value that it does in the initiating backend; even
if we could ensure that, some function we're calling could update the variable
after each call, and only the backend where that update is performed will see
the new value.  Similar problems can arise with any more-complex data
structure we might choose to use.  For example, a pseudo-random number
generator should, given a particular seed value, produce the same predictable
series of values every time.  But it does this by relying on some private
state which won't automatically be shared between cooperating backends.  A
parallel-safe PRNG would need to store its state in dynamic shared memory, and
would require locking.  The parallelism infrastructure has no way of knowing
whether the user intends to call code that has this sort of problem, and can't
do anything about it anyway.
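
To make the PRNG hazard concrete, code of roughly this shape is
parallel-unsafe for exactly the reason described (the function names are
hypothetical):

    static uint32 prng_state;   /* backend-local; never shared */

    void
    my_srandom(uint32 seed)
    {
        prng_state = seed;
    }

    uint32
    my_random(void)
    {
        /*
         * Each backend advances its own private copy of prng_state, so the
         * initiating backend and its workers produce independent sequences
         * rather than the single sequence the seed was meant to determine.
         */
        prng_state = prng_state * 1103515245 + 12345;
        return prng_state;
    }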

Instead, we take a more pragmatic approach.  First, we try to make as many of
the operations that are safe outside of parallel mode work correctly in
parallel mode as well.  Second, we try to prohibit common unsafe operations
via suitable error checks.  These checks are intended to catch 100% of
unsafe things that a user might do from the SQL interface, but code written
in C can do unsafe things that won't trigger these checks.  The error checks
are engaged via EnterParallelMode(), which should be called before creating
a parallel context, and disarmed via ExitParallelMode(), which should be
called after all parallel contexts have been destroyed.  The most
significant restriction imposed by parallel mode is that all operations must
be strictly read-only; we allow no writes to the database and no DDL.  We
might try to relax these restrictions in the future.

To make as many operations as possible safe in parallel mode, we try to copy
the most important pieces of state from the initiating backend to each
parallel worker.  This includes:

  - The set of libraries dynamically loaded by dfmgr.c.

  - The authenticated user ID and current database.  Each parallel worker
    will connect to the same database as the initiating backend, using the
    same user ID.

  - The values of all GUCs.  Accordingly, permanent changes to the value of
    any GUC are forbidden while in parallel mode; but temporary changes,
    such as entering a function with non-NULL proconfig, are OK.

  - The current subtransaction's XID, the top-level transaction's XID, and
    the list of XIDs considered current (that is, they are in-progress or
    subcommitted).  This information is needed to ensure that tuple
    visibility checks return the same results in the worker as they do in
    the initiating backend.  See also the section Transaction Integration,
    below.

  - The combo CID mappings.  This is needed to ensure consistent answers to
    tuple visibility checks.  The need to synchronize this data structure is
    a major reason why we can't support writes in parallel mode: such writes
    might create new combo CIDs, and we have no way to let other workers
    (or the initiating backend) know about them.

  - The transaction snapshot.

  - The active snapshot, which might be different from the transaction
    snapshot.

  - The currently active user ID and security context.  Note that this is
    the fourth user ID we restore: the initial step of binding to the correct
    database also involves restoring the authenticated user ID.  When GUC
    values are restored, this incidentally sets SessionUserId and OuterUserId
    to the correct values.  This final step restores CurrentUserId.

To prevent undetected or unprincipled deadlocks when running in parallel
mode, this code should eventually handle heavyweight locks in some way.
This is not implemented yet.

Transaction Integration
=======================

Regardless of what the TransactionState stack looks like in the parallel
leader, each parallel worker ends up with a stack of depth 1.  This stack
entry is marked with the special transaction block state
TBLOCK_PARALLEL_INPROGRESS so that it's not confused with an ordinary
toplevel transaction.  The XID of this TransactionState is set to the XID of
the innermost currently-active subtransaction in the initiating backend.  The
initiating backend's toplevel XID, and all current (in-progress or
subcommitted) XIDs, are stored separately from the TransactionState stack,
but in such a way that GetTopTransactionId(), GetTopTransactionIdIfAny(), and
TransactionIdIsCurrentTransactionId() return the same values that they would
in the initiating backend.  We could copy the entire transaction state stack,
but most of it would be useless: for example, you can't roll back to a
savepoint from within a parallel worker, and there are no resources
associated with the memory contexts or resource owners of intermediate
subtransactions.

No meaningful change to the transaction state can be made while in parallel
mode.  No XIDs can be assigned, and no subtransactions can start or end,
because we have no way of communicating these state changes to cooperating
backends, or of synchronizing them.  It's clearly unworkable for the
initiating backend to exit any transaction or subtransaction that was in
progress when parallelism was started before all parallel workers have
exited; and it's even more clearly crazy for a parallel worker to try to
subcommit or subabort the current subtransaction and execute in some other
transaction context than was present in the initiating backend.  It might be
practical to allow internal sub-transactions (e.g. to implement a PL/pgsql
EXCEPTION block) to be used in parallel mode, provided that they are
XID-less, because other backends wouldn't really need to know about those
transactions or do anything differently because of them.  Right now, we
don't even allow that.

At the end of a parallel operation, which can happen either because it
completed successfully or because it was interrupted by an error, parallel
workers associated with that operation exit.  In the error case, transaction
abort processing in the parallel leader kills off any remaining workers, and
the parallel leader then waits for them to die.  In the case of a successful
parallel operation, the parallel leader does not send any signals, but must
wait for workers to complete and exit of their own volition.  In either
case, it is very important that all workers actually exit before the
parallel leader cleans up the (sub)transaction in which they were created;
otherwise, chaos can ensue.  For example, if the leader is rolling back the
transaction that created the relation being scanned by a worker, the
relation could disappear while the worker is still busy scanning it.  That's
not safe.

Generally, the cleanup performed by each worker at this point is similar to
top-level commit or abort.  Each backend has its own resource owners: buffer
pins, catcache or relcache reference counts, tuple descriptors, and so on
are managed separately by each backend, and each backend must free them
before exiting.  There are, however, some important differences between
parallel worker commit or abort and a real top-level transaction commit or
abort.  Most importantly:

  - No commit or abort record is written; the initiating backend is
    responsible for this.

  - Cleanup of pg_temp namespaces is not done.  Parallel workers cannot
    safely access the initiating backend's pg_temp namespace, and should
    not create one of their own.

Coding Conventions
==================

Before beginning any parallel operation, call EnterParallelMode(); after all
parallel operations are completed, call ExitParallelMode().  To actually
parallelize a particular operation, use a ParallelContext.  The basic coding
pattern looks like this:

    EnterParallelMode();        /* prohibit unsafe state changes */

    pcxt = CreateParallelContext(entrypoint, nworkers);

    /* Allow space for application-specific data here. */
    shm_toc_estimate_chunk(&pcxt->estimator, size);
    shm_toc_estimate_keys(&pcxt->estimator, keys);

    InitializeParallelDSM(pcxt);    /* create DSM and copy state to it */

    /* Store the data for which we reserved space. */
    space = shm_toc_allocate(pcxt->toc, size);
    shm_toc_insert(pcxt->toc, key, space);

    LaunchParallelWorkers(pcxt);

    /* do parallel stuff */

    WaitForParallelWorkersToFinish(pcxt);

    /* read any final results from dynamic shared memory */

    DestroyParallelContext(pcxt);

    ExitParallelMode();
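
For instance, a caller passing the hypothetical MyParallelArgs struct
sketched under Overview above would instantiate the pattern like this (again
a sketch, not code from this commit):

    EnterParallelMode();

    pcxt = CreateParallelContext(my_parallel_worker_main, 4);

    shm_toc_estimate_chunk(&pcxt->estimator, sizeof(MyParallelArgs));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    InitializeParallelDSM(pcxt);

    args = shm_toc_allocate(pcxt->toc, sizeof(MyParallelArgs));
    args->start_block = 0;
    args->end_block = 1000;
    shm_toc_insert(pcxt->toc, MY_KEY_ARGS, args);

    LaunchParallelWorkers(pcxt);

    /* do parallel stuff */

    WaitForParallelWorkersToFinish(pcxt);

    DestroyParallelContext(pcxt);

    ExitParallelMode();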
