| author | Pavan Deolasee | 2015-08-19 03:10:34 +0000 |
| --- | --- | --- |
| committer | Pavan Deolasee | 2015-08-19 03:10:34 +0000 |
| commit | 8fcbe6accde787028a608845540ab99ff78abb58 (patch) | |
| tree | c337f3570a589e928f302196c430b0038e9ef202 | |
| parent | 2b6985afdb754282349f768e6cdf256178d04f4d (diff) | |
Commit a patch submitted by [email protected] to fix a race condition in
SharedQueueUnBind
We found it was caused by a race condition on SQueuesLock between SharedQueueUnBind
and SharedQueueBind. In rare situations, for example when cluster memory is low and
processes run much more slowly, the producer process times out and waits on
SQueuesLock while some consumers enter SharedQueueBind and successfully attach to
the shared queue. Once SharedQueueBind releases SQueuesLock, SharedQueueUnBind
removes the shared queue from SharedQueues and sets sq_sync to NULL, so the next
SharedQueueRead in such a consumer dereferences the stale pointer and dumps core
with SIGSEGV.
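To make the interleaving concrete, here is a minimal, self-contained sketch in C
using pthreads. The names (shm_queue, registry_lock, sq_sync, bound_consumers) are
hypothetical stand-ins for SQueuesLock and the squeue.c structures, not the actual
PostgreSQL-XL code; it only illustrates why a consumer that binds late can observe
a queue that the producer has already torn down.

```c
/*
 * Hypothetical stand-alone sketch of the interleaving described above.
 * Build with: cc -pthread race_sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct
{
    pthread_mutex_t registry_lock;   /* plays the role of SQueuesLock */
    void           *sq_sync;         /* sync area consumers read through */
    int             bound_consumers;
} shm_queue;

static int       dummy_sync;
static shm_queue q = { PTHREAD_MUTEX_INITIALIZER, &dummy_sync, 0 };

/* consumer: binds to the queue, then later tries to read through sq_sync */
static void *
consumer(void *arg)
{
    pthread_mutex_lock(&q.registry_lock);    /* "SharedQueueBind" */
    q.bound_consumers++;
    pthread_mutex_unlock(&q.registry_lock);

    sleep(1);                                /* producer unbinds meanwhile */

    if (q.sq_sync == NULL)                   /* a real SharedQueueRead would SEGV here */
        printf("consumer: sq_sync is NULL, the real code would crash\n");
    return NULL;
}

int
main(void)
{
    pthread_t   tid;

    pthread_create(&tid, NULL, consumer, NULL);
    usleep(100 * 1000);                      /* let the consumer bind first */

    /* buggy producer: tears the queue down without rechecking consumers */
    pthread_mutex_lock(&q.registry_lock);    /* "SharedQueueUnBind" */
    q.sq_sync = NULL;
    pthread_mutex_unlock(&q.registry_lock);

    pthread_join(tid, NULL);
    return 0;
}
```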
The fix: once SharedQueueUnBind has acquired SQueuesLock, it rechecks whether any
consumers are still running on the shared queue; if so, SharedQueueUnBind waits
until no consumers remain, or until the timeout expires, before tearing the queue
down. The patch also changes the elog level of the SharedQueues search failure in
SharedQueueBind to ERROR, avoiding an unnecessary cluster reinitialization.
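Below is a similarly hedged sketch of the recheck-before-teardown pattern the patch
applies, again with hypothetical pthread stand-ins (registry_lock for SQueuesLock,
bound_consumers for the per-consumer CONSUMER_ACTIVE scan); it is not the squeue.c
implementation. The producer re-acquires the registry lock, rechecks for bound
consumers, and only detaches the queue once none remain or a retry budget runs out.

```c
/* Sketch of the recheck-before-teardown pattern. Build with: cc -pthread fix_sketch.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

typedef struct
{
    pthread_mutex_t registry_lock;   /* stand-in for SQueuesLock */
    void           *sq_sync;         /* sync area consumers read through */
    int             bound_consumers; /* stand-in for the CONSUMER_ACTIVE scan */
} shm_queue;

static int       dummy_sync;
static shm_queue q = { PTHREAD_MUTEX_INITIALIZER, &dummy_sync, 0 };

/* consumer: binds, works for a while, then detaches */
static void *
consumer(void *arg)
{
    pthread_mutex_lock(&q.registry_lock);
    q.bound_consumers++;
    pthread_mutex_unlock(&q.registry_lock);

    usleep(300 * 1000);                      /* "reading" the queue */

    pthread_mutex_lock(&q.registry_lock);
    q.bound_consumers--;                     /* consumer is done */
    pthread_mutex_unlock(&q.registry_lock);
    return NULL;
}

/* producer teardown: retry until no consumer is bound, or retries run out */
static void
queue_unbind(int max_retries)
{
    for (;;)
    {
        pthread_mutex_lock(&q.registry_lock);
        if (q.bound_consumers == 0 || max_retries-- <= 0)
        {
            q.sq_sync = NULL;                /* safe: nobody can still read it */
            pthread_mutex_unlock(&q.registry_lock);
            return;
        }
        printf("producer: %d consumers still running, recheck\n",
               q.bound_consumers);
        pthread_mutex_unlock(&q.registry_lock);
        usleep(100 * 1000);                  /* back off, then recheck */
    }
}

int
main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, consumer, NULL);
    usleep(50 * 1000);                       /* consumer binds first */
    queue_unbind(10);
    pthread_join(tid, NULL);
    printf("queue torn down only after the consumer detached\n");
    return 0;
}
```

Rechecking under the same lock that SharedQueueBind takes is what closes the window:
a consumer can no longer attach between the producer's check and the teardown.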
-rw-r--r-- | src/backend/pgxc/squeue/squeue.c | 41 |
1 file changed, 36 insertions, 5 deletions
```diff
diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
index 41f82a7966..74cdccaaf3 100644
--- a/src/backend/pgxc/squeue/squeue.c
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -1204,6 +1204,12 @@ SharedQueueUnBind(SharedQueue squeue)
 {
     SQueueSync *sqsync = squeue->sq_sync;
     int         wait_result = 0;
+    int         i = 0;
+    int         consumer_running = 0;
+    char       *pcursor = NULL;
+
+
+CHECK:
 
     /* loop while there are active consumers */
     for (;;)
@@ -1227,11 +1233,6 @@ SharedQueueUnBind(SharedQueue squeue)
                 /* producer will continue waiting */
                 ResetLatch(&sqsync->sqs_producer_latch);
             }
-#ifdef SQUEUE_STAT
-            else
-                elog(LOG, "Done %s node %d, %ld writes and %ld reads, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
-                     squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
-#endif
 
             LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
         }
@@ -1249,8 +1250,38 @@ SharedQueueUnBind(SharedQueue squeue)
 #ifdef SQUEUE_STAT
     elog(DEBUG1, "Producer %s is done, there were %ld pauses",
          squeue->sq_key, squeue->stat_paused);
 #endif
+    elog(LOG, "Producer %s is done", squeue->sq_key);
 
     LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+    /*
+     * In rear situation, after consumers just bind to the shared queue, the producer timeout and remove the shared queue.
+     * This will cause a SEGV in the consumer. So here recheck if there are some consumers binded to the queue, if so, we need to wait them to
+     * finish.
+     */
+    consumer_running = 0;
+    for (i = 0; i < squeue->sq_nconsumers; i++)
+    {
+        ConsState *cstate = &squeue->sq_consumers[i];
+
+        LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+        /* found a consumer running */
+        if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0)
+        {
+            consumer_running++;
+        }
+
+        LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+    }
+
+    if (consumer_running)
+    {
+        elog(DEBUG1, "Producer %s have %d consumers still running, recheck now", squeue->sq_key, consumer_running);
+        LWLockRelease(SQueuesLock);
+        goto CHECK;
+    }
+
     /* All is done, clean up */
     DisownLatch(&sqsync->sqs_producer_latch);
```