/*-------------------------------------------------------------------------
 *
 * squeue.c
 *
 *	  Shared queue is for data exchange in shared memory between sessions,
 * one of which is a producer, providing data rows. Others are consumer agents -
 * sessions initiated from other datanodes, the main purpose of them is to read
 * rows from the shared queue and send them to the parent data node.
 *	  The producer is usually a consumer at the same time, it sends back tuples
 * to the parent node without putting them to the queue.
 *
 * Copyright (c) 2012-2014, TransLattice, Inc.
 *
 * IDENTIFICATION
 *	  $$
 *
 *
 *-------------------------------------------------------------------------
 */

/*
 * NOTE(review): the header name of the first #include below is missing in
 * this copy of the file (it looks like an angle-bracket system header was
 * stripped). Restore it from version control before building.
 */
#include
#include "postgres.h"

#include "miscadmin.h"
#include "access/gtm.h"
#include "catalog/pgxc_node.h"
#include "commands/prepare.h"
#include "executor/executor.h"
#include "nodes/pg_list.h"
#include "pgxc/nodemgr.h"
#include "pgxc/pgxc.h"
#include "pgxc/pgxcnode.h"
#include "pgxc/squeue.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"
#include "utils/resowner.h"
#include "pgstat.h"

/*
 * Tunables for the shared queue subsystem.
 * NOTE(review): presumably these back the NUM_SQUEUES / SQUEUE_SIZE macros
 * used below (declared in pgxc/squeue.h) - confirm against the header.
 */
int			NSQueues = 64;
int			SQueueSize = 64;

/*
 * NOTE(review): not referenced in the part of the file reviewed here;
 * presumably a marker used by the long-tuple push/pull helpers.
 */
#define LONG_TUPLE -42

/* Per-consumer synchronization objects */
typedef struct ConsumerSync
{
	LWLock	   *cs_lwlock;		/* Synchronize access to the consumer queue */
	Latch		cs_latch;		/* The latch consumer is waiting on */
} ConsumerSync;

/*
 * Shared memory structure to store synchronization info to access shared queues
 */
typedef struct SQueueSync
{
	void	   *queue;			/* NULL if not assigned to any queue */
	LWLock	   *sqs_producer_lwlock;	/* Synchronize access to the queue */
	Latch		sqs_producer_latch; /* the latch producer is waiting on */
	ConsumerSync sqs_consumer_sync[0];	/* actual length is MaxDataNodes-1,
										 * which is not known at compile
										 * time */
} SQueueSync;

/* Both producer and consumer are working */
#define CONSUMER_ACTIVE 0
/* Producer has finished work successfully and waits for consumer */
#define CONSUMER_EOF 1
/* Producer encountered error and waits for consumer to
disconnect */
#define CONSUMER_ERROR 2
/* Consumer is finished with the query, OK to unbind */
#define CONSUMER_DONE 3

/* State of a single consumer */
typedef struct
{
	int			cs_pid;			/* Process id of the consumer session */
	int			cs_node;		/* Node id of the consumer parent */

	/*
	 * Queue state. The queue is a cyclic queue where tuples are stored in the
	 * DataRow format: first goes the length of the tuple in host format
	 * (it is never sent over the network), followed by the tuple bytes.
	 */
	int			cs_ntuples;		/* Number of tuples in the queue */
	int			cs_status;		/* See CONSUMER_* defines above */
	char	   *cs_qstart;		/* Where consumer queue begins */
	int			cs_qlength;		/* The size of the consumer queue */
	int			cs_qreadpos;	/* The read position in the consumer queue */
	int			cs_qwritepos;	/* The write position in the consumer queue */
#ifdef SQUEUE_STAT
	/* Optional counters, compiled in only for statistics builds */
	long		stat_writes;
	long		stat_reads;
	long		stat_buff_writes;
	long		stat_buff_reads;
	long		stat_buff_returns;
#endif
} ConsState;

/* Shared queue header */
typedef struct SQueueHeader
{
	char		sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
										 * beginning of the hash entry */
	int			sq_pid;			/* Process id of the producer session */
	int			sq_nodeid;		/* Node id of the producer parent */
	SQueueSync *sq_sync;		/* Associated synchronization objects */
	int			sq_refcnt;		/* Reference count to this entry */
#ifdef SQUEUE_STAT
	bool		stat_finish;
	long		stat_paused;
#endif
	int			sq_nconsumers;	/* Number of consumers */
	ConsState	sq_consumers[0];	/* variable length array */
} SQueueHeader;

/*
 * Hash table where all shared queues are stored.
Key is the queue name, value * is SharedQueue */ static HTAB *SharedQueues = NULL; static LWLockPadded *SQueueLocks = NULL; /* * Pool of synchronization items */ static void *SQueueSyncs; #define SQUEUE_SYNC_SIZE \ (sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync)) #define GET_SQUEUE_SYNC(idx) \ ((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE)) #define SQUEUE_HDR_SIZE(nconsumers) \ (sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState)) #define QUEUE_FREE_SPACE(cstate) \ ((cstate)->cs_ntuples > 0 ? \ ((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \ (cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \ (cstate)->cs_qlength + (cstate)->cs_qreadpos \ - (cstate)->cs_qwritepos) \ : (cstate)->cs_qlength) #define QUEUE_WRITE(cstate, len, buf) \ do \ { \ if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \ { \ memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \ (cstate)->cs_qwritepos += (len); \ if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \ (cstate)->cs_qwritepos = 0; \ } \ else \ { \ int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \ memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \ (cstate)->cs_qwritepos = (len) - part; \ memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \ } \ } while(0) #define QUEUE_READ(cstate, len, buf) \ do \ { \ if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \ { \ memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \ (cstate)->cs_qreadpos += (len); \ if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \ (cstate)->cs_qreadpos = 0; \ } \ else \ { \ int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \ memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \ (cstate)->cs_qreadpos = (len) - part; \ memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \ } \ } while(0) static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow); static void sq_pull_long_tuple(ConsState *cstate, 
RemoteDataRow datarow, int consumerIdx, SQueueSync *sqsync); /* * SharedQueuesInit * Initialize the reference on the shared memory hash table where all shared * queues are stored. Invoked during postmaster initialization. */ void SharedQueuesInit(void) { HASHCTL info; int hash_flags; bool found; info.keysize = SQUEUE_KEYSIZE; info.entrysize = SQUEUE_SIZE; /* * Create hash table of fixed size to avoid running out of * SQueueSyncs */ hash_flags = HASH_ELEM | HASH_FIXED_SIZE; SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES, NUM_SQUEUES, &info, hash_flags); /* * Synchronization stuff is in separate structure because we need to * initialize all items now while in the postmaster. * The structure is actually an array, each array entry is assigned to * each instance of SharedQueue in use. */ SQueueSyncs = ShmemInitStruct("Shared Queues Sync", SQUEUE_SYNC_SIZE * NUM_SQUEUES, &found); if (!found) { int i, l; int nlocks = (NUM_SQUEUES * (MaxDataNodes)); /* * (MaxDataNodes - 1) * consumers + 1 producer */ bool foundLocks; /* Initialize LWLocks for queues */ SQueueLocks = (LWLockPadded *) ShmemInitStruct("Shared Queue Locks", sizeof(LWLockPadded) * nlocks, &foundLocks); /* either both syncs and locks, or none of them */ Assert(! 
foundLocks); /* Register the trannche tranche in the main tranches array */ LWLockRegisterTranche(LWTRANCHE_SHARED_QUEUES, "Shared Queue Locks"); l = 0; for (i = 0; i < NUM_SQUEUES; i++) { SQueueSync *sqs = GET_SQUEUE_SYNC(i); int j; sqs->queue = NULL; LWLockInitialize(&(SQueueLocks[l]).lock, LWTRANCHE_SHARED_QUEUES); sqs->sqs_producer_lwlock = &(SQueueLocks[l++]).lock; InitSharedLatch(&sqs->sqs_producer_latch); for (j = 0; j < MaxDataNodes-1; j++) { InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch); LWLockInitialize(&(SQueueLocks[l]).lock, LWTRANCHE_SHARED_QUEUES); sqs->sqs_consumer_sync[j].cs_lwlock = &(SQueueLocks[l++]).lock; } } } } Size SharedQueueShmemSize(void) { Size sqs_size; sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE); return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_SIZE)); } /* * SharedQueueAcquire * Reserve a named shared queue for future data exchange between processes * supplying tuples to remote Datanodes. Invoked when a remote query plan is * registered on the Datanode. The number of consumers is known at this point, * so shared queue may be formatted during reservation. The first process that * is acquiring the shared queue on the Datanode does the formatting. */ void SharedQueueAcquire(const char *sqname, int ncons) { bool found; SharedQueue sq; int trycount = 0; Assert(IsConnFromDatanode()); Assert(ncons > 0); tryagain: LWLockAcquire(SQueuesLock, LW_EXCLUSIVE); /* * Setup PGXC_PARENT_NODE_ID right now to ensure that the cleanup happens * correctly even if the consumer never really binds to the shared queue. 
*/ PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &PGXC_PARENT_NODE_TYPE); sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found); if (!sq) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("out of shared queue, please increase shared_queues"))); /* First process acquiring queue should format it */ if (!found) { int qsize; /* Size of one queue */ int i; char *heapPtr; elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons); /* Initialize the shared queue */ sq->sq_pid = 0; sq->sq_nodeid = -1; sq->sq_refcnt = 1; #ifdef SQUEUE_STAT sq->stat_finish = false; sq->stat_paused = 0; #endif /* * Assign sync object (latches to wait on) * XXX We may want to optimize this and do smart search instead of * iterating the array. */ for (i = 0; i < NUM_SQUEUES; i++) { SQueueSync *sqs = GET_SQUEUE_SYNC(i); if (sqs->queue == NULL) { sqs->queue = (void *) sq; sq->sq_sync = sqs; break; } } Assert(sq->sq_sync != NULL); sq->sq_nconsumers = ncons; /* Determine queue size for a single consumer */ qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers; heapPtr = (char *) sq; /* Skip header */ heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers); /* Set up consumer queues */ for (i = 0; i < ncons; i++) { ConsState *cstate = &(sq->sq_consumers[i]); cstate->cs_pid = 0; cstate->cs_node = -1; cstate->cs_ntuples = 0; cstate->cs_status = CONSUMER_ACTIVE; cstate->cs_qstart = heapPtr; cstate->cs_qlength = qsize; cstate->cs_qreadpos = 0; cstate->cs_qwritepos = 0; heapPtr += qsize; } Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE); } else { int i; elog(DEBUG1, "Found an existing SQueue %s - (sq_pid:%d, sq_nodeid:%d," " sq_nconsumers:%d", sqname, sq->sq_pid, sq->sq_nodeid, sq->sq_nconsumers); for (i = 0; i < sq->sq_nconsumers; i++) { elog(DEBUG1, "SQueue %s, consumer (%d) information (cs_pid:%d," " cs_node:%d, cs_ntuples:%d, cs_status: %d", sqname, i, sq->sq_consumers[i].cs_pid, sq->sq_consumers[i].cs_node, 
sq->sq_consumers[i].cs_ntuples, sq->sq_consumers[i].cs_status); } /* * A race condition is possible here. The previous operation might use * the same Shared Queue name if that was different execution of the * same Portal. So here we should try to determine if that Shared Queue * belongs to this execution or that is not-yet-released Shared Queue * of previous operation. * Though at the moment I am not sure, but I believe the BIND stage is * only happening after completion of ACQUIRE stage, so it is enough * to verify the producer (the very first node that binds) is not bound * yet. If it is bound, sleep for a moment and try again. No reason to * sleep longer, the producer needs just a quantum of CPU time to UNBIND * itself. */ if (sq->sq_pid != 0) { int i; bool old_squeue = true; for (i = 0; i < sq->sq_nconsumers; i++) { ConsState *cstate = &(sq->sq_consumers[i]); if (cstate->cs_node == PGXC_PARENT_NODE_ID) { SQueueSync *sqsync = sq->sq_sync; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); /* verify status */ if (cstate->cs_status != CONSUMER_DONE) old_squeue = false; LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); break; } } if (old_squeue) { LWLockRelease(SQueuesLock); pg_usleep(1000000L); elog(DEBUG1, "SQueue race condition, give the old producer to " "finish the work and retry again"); trycount++; if (trycount >= 10) elog(ERROR, "Couldn't resolve SQueue race condition after" " %d tries", trycount); goto tryagain; } } sq->sq_refcnt++; } LWLockRelease(SQueuesLock); } /* * SharedQueueBind * Bind to the shared queue specified by sqname either as a consumer or as a * producer. The first process that binds to the shared queue becomes a producer * and receives the consumer map, others become consumers and receive queue * indexes to read tuples from. * The consNodes int list identifies the nodes involved in the current step. * The distNodes int list describes result distribution of the current step. 
* The consNodes should be a subset of distNodes.
 * The myindex and consMap parameters are binding results. If caller process
 * is bound to the query as a producer myindex is set to -1 and index of the
 * each consumer (order number in the consNodes) is stored to the consMap array
 * at the position of the node in the distNodes. For the producer node
 * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
 * consNodes or if it was reported they won't read results, they are represented
 * as SQ_CONS_NONE.
 *
 * Returns the shared queue the caller is now bound to. Raises an ERROR if the
 * queue does not exist, or if a consumer finds its slot already marked
 * CONSUMER_ERROR or CONSUMER_DONE.
 */
SharedQueue
SharedQueueBind(const char *sqname, List *consNodes,
				List *distNodes, int *myindex, int *consMap)
{
	bool		found;
	SharedQueue sq;

	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);

	PGXC_PARENT_NODE_ID = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
			&PGXC_PARENT_NODE_TYPE);
	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);

	/*
	 * It's not clear but it seems that if the producer fails even before a
	 * consumer binds to the shared queue, the producer may remove the shared
	 * queue (or would refcount mechanism fully protect us against that?). So
	 * instead of panicking, just throw a soft error.
	 */
	if (!found)
		elog(ERROR, "Shared queue %s not found", sqname);

	/*
	 * Now acquire the queue-specific lock and then release the top level lock.
	 * We must follow a strict ordering between SQueuesLock,
	 * sqs_producer_lwlock and the consumer cs_lwlock to avoid a deadlock.
	 */
	LWLockAcquire(sq->sq_sync->sqs_producer_lwlock, LW_EXCLUSIVE);
	LWLockRelease(SQueuesLock);

	/* No producer pid recorded yet: the caller becomes the producer */
	if (sq->sq_pid == 0)
	{
		/* Producer */
		int		i;
		ListCell *lc;

		Assert(consMap);

		elog(DEBUG1, "Bind node %s to squeue of step %s as a producer",
			 PGXC_PARENT_NODE, sqname);

		/* Initialize the shared queue */
		sq->sq_pid = MyProcPid;
		sq->sq_nodeid = PGXC_PARENT_NODE_ID;
		OwnLatch(&sq->sq_sync->sqs_producer_latch);

		/* i is the position in distNodes, and so the index into consMap */
		i = 0;
		foreach(lc, distNodes)
		{
			int			nodeid = lfirst_int(lc);

			/*
			 * Producer won't go to shared queue to hand off tuple to itself,
			 * so we do not need to create queue for that entry.
			 */
			if (nodeid == PGXC_PARENT_NODE_ID)
			{
				/* Producer must be in the consNodes list */
				Assert(list_member_int(consNodes, nodeid));
				elog(DEBUG1, "SQueue %s consumer @%d is set to self",
						sqname, i);
				consMap[i++] = SQ_CONS_SELF;
			}
			/*
			 * This node may connect as a consumer, store consumer id to the map
			 * and initialize consumer queue
			 */
			else if (list_member_int(consNodes, nodeid))
			{
				ConsState  *cstate;
				int			j;

				/*
				 * NOTE(review): if neither branch below matches for any slot
				 * (no slot for this node and no unused slot left), consMap[i]
				 * is not written and i is not advanced for this node.
				 */
				for (j = 0; j < sq->sq_nconsumers; j++)
				{
					cstate = &(sq->sq_consumers[j]);
					if (cstate->cs_node == nodeid)
					{
						/* The process already reported that queue won't read */
						elog(DEBUG1, "Node %d of SQueue %s is released already "
								"at consumer %d, cs_status %d",
							 nodeid, sqname, j, cstate->cs_status);
						consMap[i++] = SQ_CONS_NONE;
						break;
					}
					else if (cstate->cs_node == -1)
					{
						/* found unused slot, assign the consumer to it */
						elog(DEBUG1, "Node %d of SQueue %s is bound at consumer "
								"%d, cs_status %d",
								nodeid, sqname, j, cstate->cs_status);
						consMap[i++] = j;
						cstate->cs_node = nodeid;
						break;
					}
				}
			}
			/*
			 * Consumer from this node won't ever connect as upper level step
			 * is not executed on the node. Discard results that may go to that
			 * node, if any.
			 */
			else
			{
				elog(DEBUG1, "Node %d of SQueue %s is not in the "
						"redistribution list and hence would never connect",
						nodeid, sqname);
				consMap[i++] = SQ_CONS_NONE;
			}
		}

		if (myindex)
			*myindex = -1;

		/*
		 * Increment the refcnt only when producer binds. This is a bit
		 * asymmetrical, but the way things are currently setup, a consumer
		 * though calls SharedQueueBind, never calls SharedQueueUnBind. The
		 * unbinding is done only by the producer after it waits for all
		 * consumers to finish.
		 *
		 * XXX This ought to be fixed someday to simplify things in Shared
		 * Queue handling
		 */
		sq->sq_refcnt++;
	}
	else
	{
		int			nconsumers;
		ListCell   *lc;

		/* Producer should be different process */
		Assert(sq->sq_pid != MyProcPid);

		elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d",
				sqname, sq->sq_nodeid, sq->sq_pid);
		/* Note: the last value logged here is the producer's pid */
		elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);

		/* Sanity checks */
		Assert(myindex);
		*myindex = -1;
		/* Ensure the passed in consumer list matches the queue */
		nconsumers = 0;
		foreach (lc, consNodes)
		{
			int			nodeid = lfirst_int(lc);
			int			i;

			if (nodeid == sq->sq_nodeid)
			{
				/*
				 * This node is a producer it should be in the consumer list,
				 * but no consumer queue for it
				 */
				continue;
			}

			/* find consumer queue for the node */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState *cstate = &(sq->sq_consumers[i]);
				if (cstate->cs_node == nodeid)
				{
					nconsumers++;
					if (nodeid == PGXC_PARENT_NODE_ID)
					{
						/*
						 * Current consumer queue is that from which current
						 * session will be sending out data rows.
						 * Initialize the queue to let producer know we are
						 * here and running.
						 */
						SQueueSync *sqsync = sq->sq_sync;

						elog(DEBUG1, "SQueue %s, consumer node %d is same as "
								"the parent node", sqname, nodeid);
						LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
									  LW_EXCLUSIVE);
						/* Make sure no consumer bound to the queue already */
						Assert(cstate->cs_pid == 0);
						/* make sure the queue is ready to read */
						Assert(cstate->cs_qlength > 0);
						/* verify status */
						if (cstate->cs_status == CONSUMER_ERROR ||
								cstate->cs_status == CONSUMER_DONE)
						{
							/* remember the status seen, it is overwritten below */
							int status = cstate->cs_status;
							/*
							 * Producer failed by the time the consumer connect.
							 * Change status to "Done" to allow producer unbind
							 * and report problem to the parent.
							 */
							cstate->cs_status = CONSUMER_DONE;
							/* Producer may be waiting for status change */
							SetLatch(&sqsync->sqs_producer_latch);
							/* drop both locks (inner one first) before raising */
							LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
							LWLockRelease(sqsync->sqs_producer_lwlock);
							ereport(ERROR,
									(errcode(ERRCODE_PRODUCER_ERROR),
									 errmsg("Producer failed while we were waiting - status was %d", status)));
						}
						/*
						 * Any other status is acceptable. Normally it would be
						 * ACTIVE. If producer have had only few rows to emit
						 * and it is already done the status would be EOF.
						 */

						/* Set up the consumer */
						cstate->cs_pid = MyProcPid;

						elog(DEBUG1, "SQueue %s, consumer at %d, status %d - "
								"setting up consumer node %d, pid %d",
								sqname, i, cstate->cs_status, cstate->cs_node,
								cstate->cs_pid);
						/* return found index */
						*myindex = i;
						OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
						LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
					}
					else
						elog(DEBUG1, "SQueue %s, consumer node %d is not same as "
								"the parent node %d", sqname, nodeid,
								PGXC_PARENT_NODE_ID);
					break;
				}
			}
			/* Check if entry was found and therefore loop was broken */
			Assert(i < sq->sq_nconsumers);
		}
		/* Check the consumer is found */
		Assert(*myindex != -1);
		Assert(sq->sq_nconsumers == nconsumers);
	}
	LWLockRelease(sq->sq_sync->sqs_producer_lwlock);
	return sq;
}


/*
 * Push data from the local tuplestore to the queue for specified consumer.
 * Return true if succeeded and the tuplestore is now empty. Return false
 * if specified queue has not enough room for the next tuple.
*/ static bool SharedQueueDump(SharedQueue squeue, int consumerIdx, TupleTableSlot *tmpslot, Tuplestorestate *tuplestore) { ConsState *cstate = &(squeue->sq_consumers[consumerIdx]); elog(DEBUG3, "Dumping SQueue %s data for consumer at %d, " "producer - node %d, pid %d, " "consumer - node %d, pid %d, status %d", squeue->sq_key, consumerIdx, squeue->sq_nodeid, squeue->sq_pid, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* discard stored data if consumer is not active */ if (cstate->cs_status != CONSUMER_ACTIVE) { elog(DEBUG3, "Discarding SQueue %s data for consumer at %d not active", squeue->sq_key, consumerIdx); tuplestore_clear(tuplestore); return true; } /* * Tuplestore does not clear eof flag on the active read pointer, causing * the store is always in EOF state once reached when there is a single * read pointer. We do not want behavior like this and workaround by using * secondary read pointer. Primary read pointer (0) is active when we are * writing to the tuple store, also it is used to bookmark current position * when reading to be able to roll back and return just read tuple back to * the store if we failed to write it out to the queue. * Secondary read pointer is for reading, and its eof flag is cleared if a * tuple is written to the store. 
*/ tuplestore_select_read_pointer(tuplestore, 1); /* If we have something in the tuplestore try to push this to the queue */ while (!tuplestore_ateof(tuplestore)) { /* save position */ tuplestore_copy_read_pointer(tuplestore, 1, 0); /* Try to get next tuple to the temporary slot */ if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot)) { /* false means the tuplestore in EOF state */ elog(DEBUG3, "Tuplestore for SQueue %s returned EOF", squeue->sq_key); break; } #ifdef SQUEUE_STAT cstate->stat_buff_reads++; #endif /* The slot should contain a data row */ Assert(tmpslot->tts_datarow); /* check if queue has enough room for the data */ if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen) { /* * If stored tuple does not fit empty queue we are entering special * procedure of pushing it through. */ if (cstate->cs_ntuples <= 0) { /* * If pushing throw is completed wake up and proceed to next * tuple, there could be enough space in the consumer queue to * fit more. */ bool done = sq_push_long_tuple(cstate, tmpslot->tts_datarow); /* * sq_push_long_tuple writes some data anyway, so wake up * the consumer. */ SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch); if (done) continue; } /* Restore read position to get same tuple next time */ tuplestore_copy_read_pointer(tuplestore, 0, 1); #ifdef SQUEUE_STAT cstate->stat_buff_returns++; #endif /* We might advance the mark, try to truncate */ tuplestore_trim(tuplestore); /* Prepare for writing, set proper read pointer */ tuplestore_select_read_pointer(tuplestore, 0); /* ... and exit */ return false; } else { /* Enqueue data */ QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen); QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg); /* Increment tuple counter. 
If it was 0 consumer may be waiting for * data so try to wake it up */ if ((cstate->cs_ntuples)++ == 0) SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch); } } /* Remove rows we have just read */ tuplestore_trim(tuplestore); /* prepare for writes, set read pointer 0 as active */ tuplestore_select_read_pointer(tuplestore, 0); return true; } /* * SharedQueueWrite * Write data from the specified slot to the specified queue. If the * tuplestore passed in has tuples try and write them first. * If specified queue is full the tuple is put into the tuplestore which is * created if necessary */ void SharedQueueWrite(SharedQueue squeue, int consumerIdx, TupleTableSlot *slot, Tuplestorestate **tuplestore, MemoryContext tmpcxt) { ConsState *cstate = &(squeue->sq_consumers[consumerIdx]); SQueueSync *sqsync = squeue->sq_sync; LWLockId clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock; RemoteDataRow datarow; bool free_datarow; Assert(cstate->cs_qlength > 0); LWLockAcquire(clwlock, LW_EXCLUSIVE); #ifdef SQUEUE_STAT cstate->stat_writes++; #endif /* * If we have anything in the local storage try to dump this first, * but do not try to dump often to avoid overhead of creating temporary * tuple slot. It should be OK to dump if queue is half empty. 
*/ if (*tuplestore) { bool dumped = false; if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2) { TupleTableSlot *tmpslot; tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor); dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore); ExecDropSingleTupleTableSlot(tmpslot); } if (!dumped) { /* No room to even dump local store, append the tuple to the store * and exit */ #ifdef SQUEUE_STAT cstate->stat_buff_writes++; #endif LWLockRelease(clwlock); tuplestore_puttupleslot(*tuplestore, slot); return; } } /* Get datarow from the tuple slot */ if (slot->tts_datarow) { /* * The function ExecCopySlotDatarow always make a copy, but here we * can optimize and avoid copying the data, so we just get the reference */ datarow = slot->tts_datarow; free_datarow = false; } else { datarow = ExecCopySlotDatarow(slot, tmpcxt); free_datarow = true; } if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen) { /* Not enough room, store tuple locally */ LWLockRelease(clwlock); /* clean up */ if (free_datarow) pfree(datarow); /* Create tuplestore if does not exist */ if (*tuplestore == NULL) { int ptrno; char storename[128]; #ifdef SQUEUE_STAT elog(DEBUG1, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far", squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads); #endif *tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt); /* We need to be able to remember/restore the read position. */ snprintf(storename, 128, "%s node %d", squeue->sq_key, cstate->cs_node); tuplestore_collect_stat(*tuplestore, storename); /* * Allocate a second read pointer to read from the store. We know * it must have index 1, so needn't store that. */ ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0); Assert(ptrno == 1); } #ifdef SQUEUE_STAT cstate->stat_buff_writes++; #endif /* Append the slot to the store... */ tuplestore_puttupleslot(*tuplestore, slot); /* ... 
and exit */ return; } else { /* do not supply data to closed consumer */ if (cstate->cs_status == CONSUMER_ACTIVE) { elog(DEBUG3, "SQueue %s, consumer is active, writing data", squeue->sq_key); /* write out the data */ QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen); QUEUE_WRITE(cstate, datarow->msglen, datarow->msg); /* Increment tuple counter. If it was 0 consumer may be waiting for * data so try to wake it up */ if ((cstate->cs_ntuples)++ == 0) SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch); } else elog(DEBUG2, "SQueue %s, consumer is not active, no need to supply data", squeue->sq_key); /* clean up */ if (free_datarow) pfree(datarow); } LWLockRelease(clwlock); } /* * SharedQueueRead * Read one data row from the specified queue into the provided tupleslot. * Returns true if EOF is reached on the specified consumer queue. * If the queue is empty, behavior is controlled by the canwait parameter. * If canwait is true it is waiting while row is available or EOF or error is * reported, if it is false, the slot is emptied and false is returned. */ bool SharedQueueRead(SharedQueue squeue, int consumerIdx, TupleTableSlot *slot, bool canwait) { ConsState *cstate = &(squeue->sq_consumers[consumerIdx]); SQueueSync *sqsync = squeue->sq_sync; RemoteDataRow datarow; int datalen; Assert(cstate->cs_qlength > 0); /* * If we run out of produced data while reading, we would like to wake up * and tell the producer to produce more. But in order to ensure that the * producer does not miss the signal, we must obtain sufficient lock on the * queue. In order to allow multiple consumers to read from their * respective queues at the same time, we obtain a SHARED lock on the * queue. But the producer must obtain an EXCLUSIVE lock to ensure it does * not miss the signal. * * Again, important to follow strict lock ordering. 
*/ LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE); Assert(cstate->cs_status != CONSUMER_DONE); while (cstate->cs_ntuples <= 0) { elog(DEBUG3, "SQueue %s, consumer node %d, pid %d, status %d - " "no tuples in the queue", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); if (cstate->cs_status == CONSUMER_EOF) { elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - " "EOF marked. Informing produer by setting CONSUMER_DONE", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* Inform producer the consumer have done the job */ cstate->cs_status = CONSUMER_DONE; /* no need to receive notifications */ DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch); /* producer done the job and no more rows expected, clean up */ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); ExecClearTuple(slot); /* * notify the producer, it may be waiting while consumers * are finishing */ SetLatch(&sqsync->sqs_producer_latch); LWLockRelease(sqsync->sqs_producer_lwlock); return true; } else if (cstate->cs_status == CONSUMER_ERROR) { elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d - " "CONSUMER_ERROR set", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* * There was a producer error while waiting. * Release all the locks and report problem to the caller. */ LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); LWLockRelease(sqsync->sqs_producer_lwlock); /* * Reporting error will cause transaction rollback and clean up of * all portals. We can not mark the portal so it does not access * the queue so we should hold it for now. We should prevent queue * unbound in between. 
*/ ereport(ERROR, (errcode(ERRCODE_PRODUCER_ERROR), errmsg("Failed to read from SQueue %s, " "consumer (node %d, pid %d, status %d) - " "CONSUMER_ERROR set", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status))); } if (canwait) { /* Prepare waiting on empty buffer */ ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch); LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - " "no queued tuples to read, waiting " "for producer to produce more data", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* Inform the producer to produce more while we wait for it */ SetLatch(&sqsync->sqs_producer_latch); LWLockRelease(sqsync->sqs_producer_lwlock); /* Wait for notification about available info */ WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, WAIT_EVENT_MQ_INTERNAL); /* got the notification, restore lock and try again */ LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE); } else { LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); LWLockRelease(sqsync->sqs_producer_lwlock); elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - " "no queued tuples to read, caller can't wait ", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); ExecClearTuple(slot); return false; } } elog(DEBUG3, "SQueue %s, consumer (node %d, pid %d, status %d) - " "%d queued tuples to read", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status, cstate->cs_ntuples); /* have at least one row, read it in and store to slot */ QUEUE_READ(cstate, sizeof(int), (char *) (&datalen)); datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen); datarow->msgnode = InvalidOid; datarow->msglen = datalen; if (datalen > cstate->cs_qlength - sizeof(int)) sq_pull_long_tuple(cstate, datarow, consumerIdx, 
sqsync); else QUEUE_READ(cstate, datalen, datarow->msg); ExecStoreDataRowTuple(datarow, slot, true); (cstate->cs_ntuples)--; #ifdef SQUEUE_STAT cstate->stat_reads++; #endif /* sanity check */ Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos)); LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); LWLockRelease(sqsync->sqs_producer_lwlock); return false; } /* * Mark specified consumer as closed discarding all input which may already be * in the queue. * If consumerIdx is -1 the producer is cleaned up. Producer need to wait for * consumers before releasing the queue, so if there are yet active consumers, * they are notified about the problem and they should disconnect from the * queue as soon as possible. */ void SharedQueueReset(SharedQueue squeue, int consumerIdx) { SQueueSync *sqsync = squeue->sq_sync; /* * We may have already cleaned up, but then an abort signalled us to clean up. * Avoid segmentation fault on abort */ if (!sqsync) return; LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); if (consumerIdx == -1) { int i; elog(DEBUG1, "SQueue %s, requested to reset producer node %d, pid %d - " "Now also resetting all consumers", squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid); /* check queue states */ for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); /* * If producer being reset before it is reached the end of the * result set, that means consumer probably would not get all * the rows and it should report error if the consumer's parent ever * try to read. No need to raise error if consumer is just closed. * If consumer is done already we do not need to change the status. 
*/ if (cstate->cs_status != CONSUMER_EOF && cstate->cs_status != CONSUMER_DONE) { elog(DEBUG1, "SQueue %s, reset consumer at %d, " "consumer node %d, pid %d, status %d - marking CONSUMER_ERROR", squeue->sq_key, i, cstate->cs_node, cstate->cs_pid, cstate->cs_status); cstate->cs_status = CONSUMER_ERROR; /* discard tuples which may already be in the queue */ cstate->cs_ntuples = 0; /* keep consistent with cs_ntuples*/ cstate->cs_qreadpos = cstate->cs_qwritepos = 0; /* wake up consumer if it is sleeping */ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); /* Tell producer about change in the state */ SetLatch(&sqsync->sqs_producer_latch); } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } } else { ConsState *cstate = &(squeue->sq_consumers[consumerIdx]); elog(DEBUG1, "SQueue %s, requested to reset consumer at %d, " "consumer node %d, pid %d, status %d", squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid, cstate->cs_status); LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE); if (cstate->cs_status != CONSUMER_DONE) { elog(DEBUG1, "SQueue %s, consumer at %d, " "consumer node %d, pid %d, status %d - marking CONSUMER_DONE", squeue->sq_key, consumerIdx, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* Inform producer the consumer have done the job */ cstate->cs_status = CONSUMER_DONE; /* * No longer need to receive notifications. If consumer has not * connected the latch is not owned */ if (cstate->cs_pid > 0) DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch); /* * notify the producer, it may be waiting while consumers * are finishing */ SetLatch(&sqsync->sqs_producer_latch); } LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock); } LWLockRelease(sqsync->sqs_producer_lwlock); } /* * Disconnect a remote consumer for the given shared queue. 
* * A node may not join a shared queue in certain circumstances such as when the * other side of the join has not produced any rows and the RemoteSubplan is * not at all executed on the node. Even in that case, we should receive a * 'statement close' message from the remote node and mark that specific * consumer as DONE. */ void SharedQueueDisconnectConsumer(const char *sqname) { bool found; SharedQueue squeue; int i; SQueueSync *sqsync; /* * Be prepared to be called even when there are no shared queues setup. */ if (!SharedQueues) return; LWLockAcquire(SQueuesLock, LW_EXCLUSIVE); squeue = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found); if (!found || squeue->sq_pid == 0) { /* * If the shared queue with the given name is not found or if the * producer has not yet bound, nothing is done. * * XXX Is it possible that the producer binds after this remote * consumer has closed the statement? If that happens, the prodcuer * will not know that this consumer is not going to connect. We * need to study this further and make adjustments if necessary. */ LWLockRelease(SQueuesLock); return; } sqsync = squeue->sq_sync; LWLockAcquire(sqsync->sqs_producer_lwlock, LW_EXCLUSIVE); LWLockRelease(SQueuesLock); /* check queue states */ for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); if (cstate->cs_node == PGXC_PARENT_NODE_ID) { cstate->cs_status = CONSUMER_DONE; /* discard tuples which may already be in the queue */ cstate->cs_ntuples = 0; /* keep consistent with cs_ntuples*/ cstate->cs_qreadpos = cstate->cs_qwritepos = 0; } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } SetLatch(&sqsync->sqs_producer_latch); LWLockRelease(sqsync->sqs_producer_lwlock); } /* * Assume that not yet connected consumers won't connect and reset them. * That should allow to Finish/UnBind the queue gracefully and prevent * producer hanging. 
*/ void SharedQueueResetNotConnected(SharedQueue squeue) { SQueueSync *sqsync = squeue->sq_sync; int result = 0; int i; elog(DEBUG1, "SQueue %s, resetting all unconnected consumers", squeue->sq_key); LWLockAcquire(squeue->sq_sync->sqs_producer_lwlock, LW_EXCLUSIVE); /* check queue states */ for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); if (cstate->cs_pid == 0 && cstate->cs_status != CONSUMER_DONE) { result++; elog(DEBUG1, "SQueue %s, consumer at %d, consumer node %d, pid %d, " "status %d is cancelled - marking CONSUMER_ERROR", squeue->sq_key, i, cstate->cs_node, cstate->cs_pid, cstate->cs_status); cstate->cs_status = CONSUMER_DONE; /* discard tuples which may already be in the queue */ cstate->cs_ntuples = 0; /* keep consistent with cs_ntuples*/ cstate->cs_qreadpos = cstate->cs_qwritepos = 0; /* wake up consumer if it is sleeping */ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } LWLockRelease(sqsync->sqs_producer_lwlock); } /* * Wait on the producer latch, for timeout msec. If timeout occurs, return * true, else return false. */ bool SharedQueueWaitOnProducerLatch(SharedQueue squeue, long timeout) { SQueueSync *sqsync = squeue->sq_sync; int rc = WaitLatch(&sqsync->sqs_producer_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT, timeout, WAIT_EVENT_MQ_INTERNAL); ResetLatch(&sqsync->sqs_producer_latch); return (rc & WL_TIMEOUT); } /* * Determine if producer can safely pause work. * The producer can pause if all consumers have enough data to read while * producer is sleeping. * Obvoius case when the producer can not pause if at least one queue is empty. 
*/ bool SharedQueueCanPause(SharedQueue squeue) { SQueueSync *sqsync = squeue->sq_sync; bool result = true; int usedspace; int ncons; int i; usedspace = 0; ncons = 0; for (i = 0; result && (i < squeue->sq_nconsumers); i++) { ConsState *cstate = &(squeue->sq_consumers[i]); LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED); /* * Count only consumers that may be blocked. * If producer has finished scanning and pushing local buffers some * consumers may be finished already. */ if (cstate->cs_status == CONSUMER_ACTIVE) { /* can not pause if some queue is empty */ result = (cstate->cs_ntuples > 0); usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ? cstate->cs_qwritepos - cstate->cs_qreadpos : cstate->cs_qlength + cstate->cs_qwritepos - cstate->cs_qreadpos); ncons++; } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } if (!ncons) return false; /* * Pause only if average consumer queue is full more then on half. */ if (result) result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2); #ifdef SQUEUE_STAT if (result) squeue->stat_paused++; #endif return result; } int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, Tuplestorestate **tuplestore) { SQueueSync *sqsync = squeue->sq_sync; TupleTableSlot *tmpslot = NULL; int i; int nstores = 0; elog(DEBUG1, "SQueue %s, finishing the SQueue - producer node %d, " "pid %d, nconsumers %d", squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid, squeue->sq_nconsumers); for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); #ifdef SQUEUE_STAT if (!squeue->stat_finish) elog(DEBUG1, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer", squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns); #endif elog(DEBUG1, "SQueue %s 
finishing, consumer at %d, consumer node %d, pid %d, " "status %d", squeue->sq_key, i, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* * if the tuplestore has data and consumer queue has space for some * try to push rows to the queue. We do not want to do that often * to avoid overhead of temp tuple slot allocation. */ if (tuplestore[i]) { /* If the consumer is not reading just destroy the tuplestore */ if (cstate->cs_status != CONSUMER_ACTIVE) { tuplestore_end(tuplestore[i]); tuplestore[i] = NULL; } else { nstores++; /* * Attempt to dump tuples from the store require tuple slot * allocation, that is not a cheap operation, so proceed if * target queue has enough space. */ if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2) { if (tmpslot == NULL) tmpslot = MakeSingleTupleTableSlot(tupDesc); if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i])) { tuplestore_end(tuplestore[i]); tuplestore[i] = NULL; cstate->cs_status = CONSUMER_EOF; nstores--; } /* Consumer may be sleeping, wake it up */ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); /* * XXX This can only be called by the producer. So no need * to set producer latch. */ } } } else { /* it set eof if not yet set */ if (cstate->cs_status == CONSUMER_ACTIVE) { cstate->cs_status = CONSUMER_EOF; SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); /* * XXX This can only be called by the producer. So no need to * set producer latch. */ } } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } if (tmpslot) ExecDropSingleTupleTableSlot(tmpslot); #ifdef SQUEUE_STAT squeue->stat_finish = true; #endif return nstores; } /* * SharedQueueUnBind * Cancel binding of current process to the shared queue. If the process * was a producer it should pass in the array of tuplestores where tuples were * queueed when it was unsafe to block. If any of the tuplestores holds data * rows they are written to the queue. The length of the array of the * tuplestores should be the same as the count of consumers. 
It is OK if some * entries are NULL. When a consumer unbinds from the shared queue it should * set the tuplestore parameter to NULL. */ void SharedQueueUnBind(SharedQueue squeue, bool failed) { SQueueSync *sqsync = squeue->sq_sync; int wait_result = 0; int i = 0; int consumer_running = 0; elog(DEBUG1, "SQueue %s, unbinding the SQueue (failed: %c) - producer node %d, " "pid %d, nconsumers %d", squeue->sq_key, failed ? 'T' : 'F', squeue->sq_nodeid, squeue->sq_pid, squeue->sq_nconsumers); CHECK: /* loop while there are active consumers */ for (;;) { int i; int c_count = 0; int unbound_count = 0; LWLockAcquire(sqsync->sqs_producer_lwlock, LW_EXCLUSIVE); /* check queue states */ for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); elog(DEBUG1, "SQueue %s unbinding, check consumer at %d, consumer node %d, pid %d, " "status %d", squeue->sq_key, i, cstate->cs_node, cstate->cs_pid, cstate->cs_status); /* is consumer working yet ? 
*/ if (cstate->cs_status == CONSUMER_ACTIVE && failed) { elog(DEBUG1, "SQueue %s, consumer status CONSUMER_ACTIVE, but " "the operation has failed - marking CONSUMER_ERROR", squeue->sq_key); cstate->cs_status = CONSUMER_ERROR; } else if (cstate->cs_status != CONSUMER_DONE && !failed) { elog(DEBUG1, "SQueue %s, consumer not yet done, wake it up and " "wait for it to finish reading", squeue->sq_key); c_count++; /* Wake up consumer if it is sleeping */ SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch); /* producer will continue waiting */ ResetLatch(&sqsync->sqs_producer_latch); if (cstate->cs_pid == 0) unbound_count++; } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } LWLockRelease(sqsync->sqs_producer_lwlock); if (c_count == 0) break; elog(DEBUG1, "SQueue %s, wait while %d consumers finish, %d consumers" "not yet bound", squeue->sq_key, c_count, unbound_count); /* wait for a notification */ wait_result = WaitLatch(&sqsync->sqs_producer_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT, 10000L, WAIT_EVENT_MQ_INTERNAL); /* * If we hit a timeout, reset the consumers which still hasn't * connected. We already make an assumption that consumers that don't * connect in time, would never connect and drop those consumers. * * XXX Unfortunately, while this is not the best way to handle the * problem, we have not found a reliable way to tell whether a specific * consumer will ever connect or not. So this kludge at least avoids a * infinite hang. 
*/ if (wait_result & WL_TIMEOUT) SharedQueueResetNotConnected(squeue); } #ifdef SQUEUE_STAT elog(DEBUG1, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused); #endif elog(DEBUG1, "SQueue %s, producer node %d, pid %d - unbound successfully", squeue->sq_key, squeue->sq_nodeid, squeue->sq_pid); LWLockAcquire(SQueuesLock, LW_EXCLUSIVE); LWLockAcquire(sqsync->sqs_producer_lwlock, LW_EXCLUSIVE); /* * In rear situation, after consumers just bind to the shared queue, the producer timeout and remove the shared queue. * This will cause a SEGV in the consumer. So here recheck if there are some consumers binded to the queue, if so, we need to wait them to * finish. */ consumer_running = 0; for (i = 0; i < squeue->sq_nconsumers; i++) { ConsState *cstate = &squeue->sq_consumers[i]; LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE); /* found a consumer running */ if (CONSUMER_ACTIVE == cstate->cs_status && cstate->cs_pid != 0) { elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, status %d, " "started running after we finished unbind", squeue->sq_key, cstate->cs_node, cstate->cs_pid, cstate->cs_status); consumer_running++; } LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } if (consumer_running) { elog(DEBUG1, "SQueue %s have %d consumers started running after we " "unbound, recheck now", squeue->sq_key, consumer_running); LWLockRelease(sqsync->sqs_producer_lwlock); LWLockRelease(SQueuesLock); goto CHECK; } /* All is done, clean up */ DisownLatch(&sqsync->sqs_producer_latch); if (--squeue->sq_refcnt == 0) { /* Now it is OK to remove hash table entry */ squeue->sq_sync = NULL; sqsync->queue = NULL; if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue) elog(PANIC, "Shared queue data corruption"); } LWLockRelease(sqsync->sqs_producer_lwlock); LWLockRelease(SQueuesLock); } /* * If queue with specified name still exists set mark respective consumer as * "Done". 
Due to executor optimization consumer may never connect the queue,
 * and should allow producer to finish it up if it is known the consumer will
 * never connect.
 *
 * The queue is looked up by name with SQueuesLock held exclusively.  When it
 * is found:
 *	- if the producer is not bound (sq_nodeid == -1) no consumer is touched;
 *	- if the producer node is PGXC_PARENT_NODE_ID no consumer is touched
 *	  either (the work is left to UnBind);
 *	- otherwise the consumer entry for PGXC_PARENT_NODE_ID is marked
 *	  CONSUMER_DONE, or, if no such entry exists, the entries with
 *	  cs_node == -1 are marked CONSUMER_DONE.
 * In every case where the queue was found its reference count is decremented
 * and the hash entry is removed once the count reaches zero.
 */
void
SharedQueueRelease(const char *sqname)
{
	bool		found;
	volatile SharedQueue sq;

	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);

	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
	if (found)
	{
		volatile SQueueSync *sqsync = sq->sq_sync;
		int			i;

		Assert(sqsync && sqsync->queue == sq);

		elog(DEBUG1, "SQueue %s producer node %d, pid %d - requested to release",
			 sqname, sq->sq_nodeid, sq->sq_pid);

		LWLockAcquire(sqsync->sqs_producer_lwlock, LW_EXCLUSIVE);

		/*
		 * If the SharedQ is not bound, we can't just remove it because
		 * somebody might have just created a fresh entry and is going to bind
		 * to it soon. We assume that the future producer will eventually
		 * release the SharedQ
		 */
		if (sq->sq_nodeid == -1)
		{
			elog(DEBUG1, "SQueue %s, producer not bound ", sqname);
			/* the producer lock must be dropped before jumping to "done" */
			LWLockRelease(sqsync->sqs_producer_lwlock);
			goto done;
		}

		/*
		 * Do not bother releasing producer, all necessary work will be
		 * done upon UnBind.
		 */
		if (sq->sq_nodeid != PGXC_PARENT_NODE_ID)
		{
			elog(DEBUG1, "SQueue %s, we are consumer from node %d", sqname,
				 PGXC_PARENT_NODE_ID);

			/* find specified node in the consumer lists */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState  *cstate = &(sq->sq_consumers[i]);

				if (cstate->cs_node == PGXC_PARENT_NODE_ID)
				{
					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
								  LW_EXCLUSIVE);

					elog(DEBUG1, "SQueue %s, consumer node %d, pid %d, "
						 "status %d", sq->sq_key, cstate->cs_node,
						 cstate->cs_pid, cstate->cs_status);

					/*
					 * If the consumer pid is not set, we are looking at a race
					 * condition where the old producer (which supplied the
					 * tuples to this remote datanode) may have finished and
					 * marked all consumers as CONSUMER_EOF, the consumers
					 * themselves consumed all the tuples and marked
					 * themselves as CONSUMER_DONE. The old producer in that
					 * case may have actually removed the SharedQ from shared
					 * memory.
					 * But if a new execution for this same portal
					 * comes before the consumer sends a "Close Portal" message
					 * (which subsequently calls this function), we may end up
					 * corrupting state for the upcoming consumer for this new
					 * execution of the portal.
					 *
					 * It seems best to just ignore the release call in such
					 * cases.
					 */
					if (cstate->cs_pid == 0)
					{
						elog(DEBUG1, "SQueue %s, consumer node %d, already released",
							 sq->sq_key, cstate->cs_node);
					}
					else if (cstate->cs_status != CONSUMER_DONE)
					{
						/* Inform producer the consumer have done the job */
						cstate->cs_status = CONSUMER_DONE;
						/* no need to receive notifications */
						if (cstate->cs_pid > 0)
						{
							DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
							cstate->cs_pid = 0;
						}
						/*
						 * notify the producer, it may be waiting while
						 * consumers are finishing
						 */
						SetLatch(&sqsync->sqs_producer_latch);
						/*
						 * Note: cs_pid has possibly been cleared above, so
						 * the pid printed here is the value after the reset.
						 */
						elog(DEBUG1, "SQueue %s, release consumer at %d, node "
							 "%d, pid %d, status %d ", sqname, i,
							 cstate->cs_node, cstate->cs_pid,
							 cstate->cs_status);
					}

					/* both locks are released before leaving via "done" */
					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
					LWLockRelease(sqsync->sqs_producer_lwlock);
					/* exit */
					goto done;
				}
			}

			elog(DEBUG1, "SQueue %s, consumer from node %d never bound",
				 sqname, PGXC_PARENT_NODE_ID);

			/*
			 * The consumer was never bound. Find empty consumer slot and
			 * register node here to let producer know that the node will never
			 * be consuming.
			 *
			 * NOTE(review): as written the loop marks every slot whose
			 * cs_node is -1 (it does not stop at the first one) and leaves
			 * cs_node itself unchanged - confirm this is intended.
			 */
			for (i = 0; i < sq->sq_nconsumers; i++)
			{
				ConsState  *cstate = &(sq->sq_consumers[i]);

				if (cstate->cs_node == -1)
				{
					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
								  LW_EXCLUSIVE);
					/* Inform producer the consumer have done the job */
					cstate->cs_status = CONSUMER_DONE;
					SetLatch(&sqsync->sqs_producer_latch);
					elog(DEBUG1, "SQueue %s, consumer at %d marking as "
						 "CONSUMER_DONE", sqname, i);
					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
				}
			}
		}
		LWLockRelease(sqsync->sqs_producer_lwlock);
	}
done:
	/*
	 * If we are the last holder of the SQueue, remove it from the hash table
	 * to avoid any leak.  SQueuesLock is still held here on every path; the
	 * producer lock has already been released.
	 */
	if (sq && --sq->sq_refcnt == 0)
	{
		/* Now it is OK to remove hash table entry */
		sq->sq_sync->queue = NULL;
		sq->sq_sync = NULL;
		if (hash_search(SharedQueues, sq->sq_key, HASH_REMOVE, NULL) != sq)
			elog(PANIC, "Shared queue data corruption");
	}
	LWLockRelease(SQueuesLock);
}


/*
 * Called when the backend is ending.
 *
 * The (code, arg) parameters are not used by the body.  A temporary resource
 * owner is installed as CurrentResourceOwner, all prepared statements are
 * dropped, the owner's resources are released in the three release phases,
 * and CurrentResourceOwner is set back to NULL.
 */
void
SharedQueuesCleanup(int code, Datum arg)
{
	/* Need to be able to look into catalogs */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");

	/*
	 * Release all registered prepared statements.
	 * If a shared queue name is associated with the statement this queue will
	 * be released.
	 */
	DropAllPreparedStatements();

	/* Release everything */
	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true);
	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true);
	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true);
	CurrentResourceOwner = NULL;
}


/*
 * sq_push_long_tuple
 *    Routine to push through the consumer state tuple longer than the consumer
 * queue. Long tuple is written by a producer partially, and only when the
 * consumer queue is empty.
 * The consumer can determine that the tuple being read is long if the length
 * of the tuple which is read before data is exceeding queue length.
* Consumers is switching to the long tuple mode and read in the portion of * data which is already in the queue. After reading in each portion of data * consumer sets cs_ntuples to LONG_TUPLE to indicate it is in long tuple * mode, and writes out number of already read bytes to the beginning of the * queue. * While Consumer is reading in tuple data Producer may work on other task: * execute query and send tuples to other Customers. If Producer sees the * LONG_TUPLE indicator it may write out next portion. The tuple remains * current in the tuplestore, and Producer just needs to read offset from * the buffer to know what part of data to write next. * After tuple is completely written the Producer is advancing to next tuple * and continue operation in normal mode. */ static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow) { if (cstate->cs_ntuples == 0) { /* the tuple is too big to fit the queue, start pushing it through */ int len; /* * Output actual message size, to prepare consumer: * allocate memory and set up transmission. */ QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen); /* Output as much as possible */ len = cstate->cs_qlength - sizeof(int); Assert(datarow->msglen > len); QUEUE_WRITE(cstate, len, datarow->msg); cstate->cs_ntuples = 1; return false; } else { int offset; int len; /* Continue pushing through long tuple */ Assert(cstate->cs_ntuples == LONG_TUPLE); /* * Consumer outputs number of bytes already read at the beginning of * the queue. 
*/ memcpy(&offset, cstate->cs_qstart, sizeof(int)); Assert(offset > 0 && offset < datarow->msglen); /* remaining data */ len = datarow->msglen - offset; /* * We are sending remaining lengs just for sanity check at the consumer * side */ QUEUE_WRITE(cstate, sizeof(int), (char *) &len); if (len > cstate->cs_qlength - sizeof(int)) { /* does not fit yet */ len = cstate->cs_qlength - sizeof(int); QUEUE_WRITE(cstate, len, datarow->msg + offset); cstate->cs_ntuples = 1; return false; } else { /* now we are done */ QUEUE_WRITE(cstate, len, datarow->msg + offset); cstate->cs_ntuples = 1; return true; } } } /* * sq_pull_long_tuple * Read in from the queue data of a long tuple which does not the queue. * See sq_push_long_tuple for more details * * The function is entered with LWLocks held on the consumer as well as * procuder sync. The function exits with both of those locks held, even * though internally it may release those locks before going to sleep. */ static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow, int consumerIdx, SQueueSync *sqsync) { int offset = 0; int len = datarow->msglen; ConsumerSync *sync = &sqsync->sqs_consumer_sync[consumerIdx]; for (;;) { /* determine how many bytes to read */ if (len > cstate->cs_qlength - sizeof(int)) len = cstate->cs_qlength - sizeof(int); /* read data */ QUEUE_READ(cstate, len, datarow->msg + offset); /* remember how many we read already */ offset += len; /* check if we are done */ if (offset == datarow->msglen) return; /* need more, set up queue to accept data from the producer */ Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */ cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */ /* Inform producer how many bytes we have already */ memcpy(cstate->cs_qstart, &offset, sizeof(int)); /* Release locks and wait until producer supply more data */ while (cstate->cs_ntuples == LONG_TUPLE) { /* * First up wake the producer */ SetLatch(&sqsync->sqs_producer_latch); /* * We must reset the 
consumer latch while holding the lock to * ensure the producer can't change the state in between. */ ResetLatch(&sync->cs_latch); /* * Now release all locks before going into a wait state */ LWLockRelease(sync->cs_lwlock); LWLockRelease(sqsync->sqs_producer_lwlock); /* Wait for notification about available info */ WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1, WAIT_EVENT_MQ_INTERNAL); /* got the notification, restore lock and try again */ LWLockAcquire(sqsync->sqs_producer_lwlock, LW_SHARED); LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE); } /* Read length of remaining data */ QUEUE_READ(cstate, sizeof(int), (char *) &len); /* Make sure we are doing the same tuple */ Assert(offset + len == datarow->msglen); /* next iteration */ } }