summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavan Deolasee2016-05-05 05:32:04 +0000
committerPavan Deolasee2016-10-18 10:05:07 +0000
commit82696b10ea2c9fcbb600019f604bb62daf0941b0 (patch)
tree9570d037998d648db2663fb53c1897be98fcbe5c
parent630b361df38b0066ac495f6790b317784aa8e183 (diff)
Add a ref count mechanism to deal with situations where a Shared Queue is
acquired but never bound by any of the node, thus causing leakage To be honest, this area requires further work. The way things are currently setup, producer and consumers all bind to a shared queue, but only producer eventually unbinds. The implementation has logic to wait out for consumers before destroying a shared queue. While this is okay, a more defined entry and exit points are required for producer and consumers. The code also today relies on timeouts to handle the case where a consumer never binds to a shared queue, thus causing large delays. These delays are more prominent for very short queries.
-rw-r--r--src/backend/pgxc/squeue/squeue.c44
1 files changed, 39 insertions, 5 deletions
diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
index 09700c1af3..0d2844aa5c 100644
--- a/src/backend/pgxc/squeue/squeue.c
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -104,6 +104,7 @@ typedef struct SQueueHeader
int sq_pid; /* Process id of the producer session */
int sq_nodeid; /* Node id of the producer parent */
SQueueSync *sq_sync; /* Associated sinchronization objects */
+ int sq_refcnt; /* Reference count to this entry */
#ifdef SQUEUE_STAT
bool stat_finish;
long stat_paused;
@@ -287,6 +288,7 @@ tryagain:
/* Initialize the shared queue */
sq->sq_pid = 0;
sq->sq_nodeid = -1;
+ sq->sq_refcnt = 1;
#ifdef SQUEUE_STAT
sq->stat_finish = false;
sq->stat_paused = 0;
@@ -401,8 +403,8 @@ tryagain:
" %d tries", trycount);
goto tryagain;
}
-
}
+ sq->sq_refcnt++;
}
LWLockRelease(SQueuesLock);
}
@@ -521,6 +523,18 @@ SharedQueueBind(const char *sqname, List *consNodes,
if (myindex)
*myindex = -1;
+
+ /*
+ * Increment the refcnt only when producer binds. This is a bit
+ * asymmetrical, but the way things are currently setup, a consumer
+ * though calls SharedQueueBind, never calls SharedQueueUnBind. The
+ * unbinding is done only by the producer after it waits for all
+ * consumers to finish.
+ *
+ * XXX This ought to be fixed someday to simplify things in Shared
+ * Queue handling
+ */
+ sq->sq_refcnt++;
}
else
{
@@ -1433,6 +1447,15 @@ CHECK:
LWLockRelease(SQueuesLock);
goto CHECK;
}
+
+ /*
+ * XXX Decrement the refcnt, but it doesn't really matter because we are
+ * unconditionally removing the SQueue anyways. SharedQueueRelease is
+ * prepared to work with already removed SQueue
+ *
+ * This ought to be fixed someday
+ */
+ squeue->sq_refcnt--;
/* All is done, clean up */
DisownLatch(&sqsync->sqs_producer_latch);
@@ -1481,8 +1504,7 @@ SharedQueueRelease(const char *sqname)
if (sq->sq_nodeid == -1)
{
elog(DEBUG1, "SQueue %s, producer not bound ", sqname);
- LWLockRelease(SQueuesLock);
- return;
+ goto done;
}
/*
@@ -1549,8 +1571,7 @@ SharedQueueRelease(const char *sqname)
}
LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
/* exit */
- LWLockRelease(SQueuesLock);
- return;
+ goto done;
}
}
@@ -1578,6 +1599,19 @@ SharedQueueRelease(const char *sqname)
}
}
}
+done:
+ /*
+ * If we are the last holder of the SQueue, remove it from the hash table
+ * to avoid any leak
+ */
+ if (sq && --sq->sq_refcnt == 0)
+ {
+ /* Now it is OK to remove hash table entry */
+ sq->sq_sync->queue = NULL;
+ sq->sq_sync = NULL;
+ if (hash_search(SharedQueues, sq->sq_key, HASH_REMOVE, NULL) != sq)
+ elog(PANIC, "Shared queue data corruption");
+ }
LWLockRelease(SQueuesLock);
}