| author | Pavan Deolasee | 2015-08-19 03:10:34 +0000 |
| --- | --- | --- |
| committer | Pavan Deolasee | 2015-08-19 03:10:34 +0000 |
| commit | 8fcbe6accde787028a608845540ab99ff78abb58 (patch) | |
| tree | c337f3570a589e928f302196c430b0038e9ef202 | |
| parent | 2b6985afdb754282349f768e6cdf256178d04f4d (diff) | |
Commit a patch submitted by [email protected] to fix a race condition in
SharedQueueUnBind
We found it was caused by a race condition on SQueuesLock between SharedQueueUnBind
and SharedQueueBind. In rare situations, for example when cluster memory is low and
processes run much more slowly, the producer process times out and waits on
SQueuesLock while some consumers enter SharedQueueBind and successfully attach to
the shared queue. Once SharedQueueBind releases SQueuesLock, SharedQueueUnBind
removes the shared queue from SharedQueues and sets sq_sync to NULL, so the next
SharedQueueRead in such a consumer dereferences the stale pointer and dumps core
with SIGSEGV.
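To make the interleaving concrete, here is a minimal, self-contained sketch in C
using pthreads. The names (shm_queue, registry_lock, sq_sync, bound_consumers) are
hypothetical stand-ins for SQueuesLock and the squeue.c structures, not the actual
PostgreSQL-XL code; it only illustrates why a consumer that binds late can observe
a queue that the producer has already torn down.

```c
/*
 * Hypothetical stand-alone sketch of the interleaving described above.
 * Build with: cc -pthread race_sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct
{
    pthread_mutex_t registry_lock;   /* plays the role of SQueuesLock */
    void           *sq_sync;         /* sync area consumers read through */
    int             bound_consumers;
} shm_queue;

static int       dummy_sync;
static shm_queue q = { PTHREAD_MUTEX_INITIALIZER, &dummy_sync, 0 };

/* consumer: binds to the queue, then later tries to read through sq_sync */
static void *
consumer(void *arg)
{
    pthread_mutex_lock(&q.registry_lock);    /* "SharedQueueBind" */
    q.bound_consumers++;
    pthread_mutex_unlock(&q.registry_lock);

    sleep(1);                                /* producer unbinds meanwhile */

    if (q.sq_sync == NULL)                   /* a real SharedQueueRead would SEGV here */
        printf("consumer: sq_sync is NULL, the real code would crash\n");
    return NULL;
}

int
main(void)
{
    pthread_t   tid;

    pthread_create(&tid, NULL, consumer, NULL);
    usleep(100 * 1000);                      /* let the consumer bind first */

    /* buggy producer: tears the queue down without rechecking consumers */
    pthread_mutex_lock(&q.registry_lock);    /* "SharedQueueUnBind" */
    q.sq_sync = NULL;
    pthread_mutex_unlock(&q.registry_lock);

    pthread_join(tid, NULL);
    return 0;
}
```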
The fix: once SharedQueueUnBind has acquired SQueuesLock, it rechecks whether any
consumers are still running on the shared queue; if so, SharedQueueUnBind waits
until no consumers remain, or until the timeout expires, before tearing the queue
down. The patch also changes the elog level of the SharedQueues search failure in
SharedQueueBind to ERROR, avoiding an unnecessary cluster reinitialization.
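Below is a similarly hedged sketch of the recheck-before-teardown pattern the patch
applies, again with hypothetical pthread stand-ins (registry_lock for SQueuesLock,
bound_consumers for the per-consumer CONSUMER_ACTIVE scan); it is not the squeue.c
implementation. The producer re-acquires the registry lock, rechecks for bound
consumers, and only detaches the queue once none remain or a retry budget runs out.

```c
/* Sketch of the recheck-before-teardown pattern. Build with: cc -pthread fix_sketch.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct
{
    pthread_mutex_t registry_lock;   /* stand-in for SQueuesLock */
    void           *sq_sync;         /* sync area consumers read through */
    int             bound_consumers; /* stand-in for the CONSUMER_ACTIVE scan */
} shm_queue;

static int       dummy_sync;
static shm_queue q = { PTHREAD_MUTEX_INITIALIZER, &dummy_sync, 0 };

/* consumer: binds, works for a while, then detaches */
static void *
consumer(void *arg)
{
    pthread_mutex_lock(&q.registry_lock);
    q.bound_consumers++;
    pthread_mutex_unlock(&q.registry_lock);

    usleep(300 * 1000);                      /* "reading" the queue */

    pthread_mutex_lock(&q.registry_lock);
    q.bound_consumers--;                     /* consumer is done */
    pthread_mutex_unlock(&q.registry_lock);
    return NULL;
}

/* producer teardown: retry until no consumer is bound, or retries run out */
static void
queue_unbind(int max_retries)
{
    for (;;)
    {
        pthread_mutex_lock(&q.registry_lock);
        if (q.bound_consumers == 0 || max_retries-- <= 0)
        {
            q.sq_sync = NULL;                /* safe: nobody can still read it */
            pthread_mutex_unlock(&q.registry_lock);
            return;
        }
        printf("producer: %d consumers still running, recheck\n",
               q.bound_consumers);
        pthread_mutex_unlock(&q.registry_lock);
        usleep(100 * 1000);                  /* back off, then recheck */
    }
}

int
main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, consumer, NULL);
    usleep(50 * 1000);                       /* consumer binds first */
    queue_unbind(10);
    pthread_join(tid, NULL);
    printf("queue torn down only after the consumer detached\n");
    return 0;
}
```

Rechecking under the same lock that SharedQueueBind takes is what closes the window:
a consumer can no longer attach between the producer's check and the teardown.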
-rw-r--r-- | src/backend/pgxc/squeue/squeue.c | 41 |
1 file changed, 36 insertions, 5 deletions
```diff
diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
index 41f82a7966..74cdccaaf3 100644
--- a/src/backend/pgxc/squeue/squeue.c
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -1204,6 +1204,12 @@ SharedQueueUnBind(SharedQueue squeue)
 {
     SQueueSync *sqsync = squeue->sq_sync;
     int         wait_result = 0;
+    int         i = 0;
+    int         consumer_running = 0;
+    char       *pcursor = NULL;
+
+
+CHECK:
 
     /* loop while there are active consumers */
     for (;;)
@@ -1227,11 +1233,6 @@ SharedQueueUnBind(SharedQueue squeue)
                 /* producer will continue waiting */
                 ResetLatch(&sqsync->sqs_producer_latch);
             }
-#ifdef SQUEUE_STAT
-            else
-                elog(LOG, "Done %s node %d, %ld writes and %ld reads, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
-                     squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
-#endif
 
             LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
         }
@@ -1249,8 +1250,38 @@ SharedQueueUnBind(SharedQueue squeue)
 #ifdef SQUEUE_STAT
     elog(DEBUG1, "Producer %s is done, there were %ld pauses",
          squeue->sq_key, squeue->stat_paused);
 #endif
+    elog(LOG, "Producer %s is done", squeue->sq_key);
 
     LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+    /*
+     * In rear situation, after consumers just bind to the shared queue, the producer timeout and remove the shared queue.
+     * This will cause a SEGV in the consumer. So here recheck if there are some consumers binded to the queue, if so, we need to wait them to
+     * finish.
+     */
+    consumer_running = 0;
+    for (i = 0; i < squeue->sq_nconsumers; i++)
+    {
+        ConsState *cstate = &squeue->sq_consumers[i];
+
+        LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+        /* found a consumer running */
+        if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0)
+        {
+            consumer_running++;
+        }
+
+        LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+    }
+
+    if (consumer_running)
+    {
+        elog(DEBUG1, "Producer %s have %d consumers still running, recheck now", squeue->sq_key, consumer_running);
+        LWLockRelease(SQueuesLock);
+        goto CHECK;
+    }
+
     /* All is done, clean up */
     DisownLatch(&sqsync->sqs_producer_latch);
```