/*-------------------------------------------------------------------------
 *
 * pause.c
 *
 *	 Cluster Pause/Unpause handling
 *
 * IDENTIFICATION
 *	 $$
 *
 *-------------------------------------------------------------------------
 */
#ifdef XCP
#include "postgres.h"

#include "pgxc/execRemote.h"
#include "pgxc/pause.h"
#include "pgxc/pgxc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "miscadmin.h"

/* globals */
bool	cluster_lock_held;
bool	cluster_ex_lock_held;

static void HandleClusterPause(bool pause, bool initiator);
static void ProcessClusterPauseRequest(bool pause);

ClusterLockInfo *ClustLinfo = NULL;

/*
 * ProcessClusterPauseRequest:
 *
 * Carry out a PAUSE/UNPAUSE request on a coordinator node
 */
static void
ProcessClusterPauseRequest(bool pause)
{
    char *action = pause ? "PAUSE" : "UNPAUSE";

    if (!IS_PGXC_COORDINATOR || !IsConnFromCoord())
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("The %s CLUSTER message is expected to "
                        "arrive at a coordinator from another coordinator",
                        action)));

    elog(DEBUG2, "Received %s CLUSTER from a coordinator", action);

    /*
     * If handling UNPAUSE, ensure that the cluster lock is already held in
     * exclusive mode
     */
    if (!pause && !cluster_ex_lock_held)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("Received an UNPAUSE request when cluster not PAUSED!")));

    /*
     * Enable/disable local queries. We need to release the current lock
     * before acquiring the other mode
     *
     * TODO: Think of some timeout mechanism here, if the locking takes too
     * much time...
     */
    ReleaseClusterLock(!pause);
    AcquireClusterLock(pause);

    cluster_ex_lock_held = pause;

    elog(DEBUG2, "%s queries at the coordinator", pause ? "Paused" : "Resumed");
}

/*
 * HandleClusterPause:
 *
 * Carry out PAUSE/UNPAUSE on this coordinator and, if we are the initiator,
 * drive it on all the other coordinators. Any errors are reported via
 * ereport.
 */
static void
HandleClusterPause(bool pause, bool initiator)
{
    PGXCNodeAllHandles *coord_handles;
    int		conn;
    int		response;
    char   *action = pause ? "PAUSE" : "UNPAUSE";

    elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action);

    if (pause && cluster_ex_lock_held)
    {
        ereport(NOTICE, (errmsg("CLUSTER already PAUSED")));

        /* Nothing to do */
        return;
    }

    if (!pause && !cluster_ex_lock_held)
    {
        ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE")));

        /* Nothing to do */
        return;
    }

    /*
     * If we are one of the participating coordinators, just do the action
     * locally and return
     */
    if (!initiator)
    {
        ProcessClusterPauseRequest(pause);
        return;
    }

    /*
     * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. We send
     * the requests asynchronously, update the local cluster lock, and then
     * wait for the remote coordinators to respond
     */
    coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);

    for (conn = 0; conn < coord_handles->co_conn_count; conn++)
    {
        PGXCNodeHandle *handle = coord_handles->coord_handles[conn];

        if (pgxc_node_send_query(handle,
                                 pause ? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0)
            ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR),
                     errmsg("Failed to send %s CLUSTER request to some coordinator nodes",
                            action)));
    }

    /*
     * Disable/enable local queries. We need to release the SHARED lock first
     *
     * TODO: Start a timer to cancel the request in case of a timeout
     */
    ReleaseClusterLock(!pause);
    AcquireClusterLock(pause);

    cluster_ex_lock_held = pause;

    elog(DEBUG2, "%s queries at the driving coordinator",
         pause ? "Paused" : "Resumed");

    /*
     * Local queries are paused/enabled. Check the status of the remote
     * coordinators now. We need a TRY/CATCH block here, so that if one of
     * the coordinators fails for some reason, we can make a best effort to
     * salvage the situation at the others.
     *
     * We hope that errors in the earlier loop generally do not occur (out of
     * memory, improper handles, ...), or we can add a similar TRY/CATCH
     * block there too.
     *
     * To repeat: all the salvaging is best effort, really...
     */
    PG_TRY();
    {
        ResponseCombiner combiner;

        InitResponseCombiner(&combiner, coord_handles->co_conn_count,
                             COMBINE_TYPE_NONE);

        for (conn = 0; conn < coord_handles->co_conn_count; conn++)
        {
            PGXCNodeHandle *handle;

            handle = coord_handles->coord_handles[conn];

            while (true)
            {
                if (pgxc_node_receive(1, &handle, NULL))
                    ereport(ERROR,
                            (errcode(ERRCODE_INTERNAL_ERROR),
                             errmsg("Failed to receive a response from the remote coordinator node")));

                response = handle_response(handle, &combiner);
                if (response == RESPONSE_EOF)
                    continue;
                else if (response == RESPONSE_COMPLETE)
                    break;
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_INTERNAL_ERROR),
                             errmsg("%s CLUSTER command failed "
                                    "with error %s", action, handle->error)));
            }
        }

        if (combiner.errorMessage)
        {
            char *code = combiner.errorCode;

            if (combiner.errorDetail != NULL)
                ereport(ERROR,
                        (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
                         errmsg("%s", combiner.errorMessage),
                         errdetail("%s", combiner.errorDetail)));
            else
                ereport(ERROR,
                        (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
                         errmsg("%s", combiner.errorMessage)));
        }

        CloseCombiner(&combiner);
    }
    PG_CATCH();
    {
        /*
         * If PAUSE CLUSTER failed, issue UNPAUSE on the reachable nodes. For
         * failures during UNPAUSE, manual intervention may be needed at the
         * offending coordinator node (maybe a pg_cancel_backend() on the
         * backend that's holding the exclusive lock, or something similar)
         */
        if (!pause)
            ereport(WARNING,
                    (errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes."
                            " Manual intervention may be required!")));
        else
            ereport(WARNING,
                    (errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes."
                            " Trying to UNPAUSE reachable nodes now")));

        if (pause)
        {
            for (conn = 0; conn < coord_handles->co_conn_count; conn++)
            {
                PGXCNodeHandle *handle = coord_handles->coord_handles[conn];

                (void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");

                /*
                 * The incoming data should hopefully be discarded as part of
                 * cleanup..
                 */
            }
        }

        /* Clean up locally: revert to the lock mode we held before */
        ReleaseClusterLock(pause);
        AcquireClusterLock(!pause);

        /*
         * Keep the flag consistent with the lock we now hold: a failed PAUSE
         * dropped the exclusive lock above, while a failed UNPAUSE
         * re-acquired it
         */
        cluster_ex_lock_held = !pause;

        PG_RE_THROW();
    }
    PG_END_TRY();

    elog(DEBUG2, "Successfully completed %s CLUSTER command on "
         "all coordinator nodes", action);
}
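
/*
 * For orientation, a sketch of the end-to-end PAUSE flow as implemented
 * above (the node names C1..Cn are illustrative only):
 *
 *   C1 (initiator):  RequestClusterPause(true)
 *                      -> HandleClusterPause(true, initiator = true)
 *                         - send "PAUSE CLUSTER" to C2..Cn asynchronously
 *                         - swap the local SHARED lock for the EXCLUSIVE lock
 *                         - wait for RESPONSE_COMPLETE from each remote node
 *   C2..Cn:          RequestClusterPause(true)
 *                      -> HandleClusterPause(true, initiator = false)
 *                         -> ProcessClusterPauseRequest(true)
 *                            - swap the local SHARED lock for the EXCLUSIVE lock
 *
 * UNPAUSE follows the same shape with the lock transition reversed.
 */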

/*
 * RequestClusterPause
 *
 * Entry point for the PAUSE/UNPAUSE CLUSTER commands
 */
void
RequestClusterPause(bool pause, char *completionTag)
{
    char *action = pause ? "PAUSE" : "UNPAUSE";
    bool  initiator = true;

    elog(DEBUG2, "%s CLUSTER request received", action);

    /* Only a superuser can perform this activity on a cluster */
    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 errmsg("%s CLUSTER command: must be a superuser", action)));

    /* Ensure that we are a coordinator */
    if (!IS_PGXC_COORDINATOR)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("%s CLUSTER command must be sent to a coordinator", action)));

    /*
     * Did the command come directly to this coordinator or via another
     * coordinator?
     */
    if (IsConnFromCoord())
        initiator = false;

    HandleClusterPause(pause, initiator);

    if (completionTag)
        snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action);
}

/*
 * If the backend is shutting down, clean up the PAUSE cluster lock
 * appropriately. We do this before shutting down shmem, because this needs
 * LWLocks and related infrastructure
 */
void
PGXCCleanClusterLock(int code, Datum arg)
{
    PGXCNodeAllHandles *coord_handles;
    int		conn;

    if (cluster_lock_held && !cluster_ex_lock_held)
    {
        ReleaseClusterLock(false);
        cluster_lock_held = false;
    }

    /* Do nothing if the cluster lock is not held */
    if (!cluster_ex_lock_held)
        return;

    /* Do nothing if we are not the initiator */
    if (IsConnFromCoord())
        return;

    coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true);

    /* Make a best effort to UNPAUSE the other coordinators now */
    for (conn = 0; conn < coord_handles->co_conn_count; conn++)
    {
        PGXCNodeHandle *handle = coord_handles->coord_handles[conn];

        /* No error checking here... */
        (void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
    }

    /* Release locally too. We do not want a dangling value in cl_holder_pid! */
    ReleaseClusterLock(true);
    cluster_ex_lock_held = false;
}
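
/*
 * PGXCCleanClusterLock() has the standard PostgreSQL exit-callback signature
 * (int code, Datum arg). A minimal sketch of registering it, assuming the
 * usual on_shmem_exit() hook is used (the actual registration site lives
 * elsewhere in the tree):
 *
 *   on_shmem_exit(PGXCCleanClusterLock, (Datum) 0);
 */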

/* Report shared memory space needed by ClusterLockShmemInit */
Size
ClusterLockShmemSize(void)
{
    Size size = 0;

    size = add_size(size, sizeof(ClusterLockInfo));

    return size;
}

/* Allocate and initialize cluster-locking related shared memory */
void
ClusterLockShmemInit(void)
{
    bool found;

    ClustLinfo = (ClusterLockInfo *)
        ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found);

    if (!found)
    {
        /* First time through, so initialize */
        MemSet(ClustLinfo, 0, ClusterLockShmemSize());
        SpinLockInit(&ClustLinfo->cl_mutex);
    }
}

/*
 * AcquireClusterLock
 *
 * Based on the argument passed in, try to update the shared memory
 * appropriately. If the conditions cannot be satisfied immediately, this
 * function resorts to a simple sleep. We don't expect PAUSE CLUSTER to occur
 * frequently, so most calls will return immediately without sleeping at all.
 *
 * We could have used a semaphore to let processes sleep while the cluster
 * lock is held. But we are really not worried about performance and
 * immediate wakeups around the PAUSE CLUSTER functionality. Sleeping in an
 * infinite loop keeps things simple yet correct.
 */
void
AcquireClusterLock(bool exclusive)
{
    volatile ClusterLockInfo *clinfo = ClustLinfo;

    if (exclusive && cluster_ex_lock_held)
        return;

    /*
     * In the normal case, none of the backends ask for the exclusive lock,
     * so they just bump cl_process_count and exit the loop immediately
     */
    for (;;)
    {
        bool wait = false;

        SpinLockAcquire(&clinfo->cl_mutex);

        if (!exclusive)
        {
            if (clinfo->cl_holder_pid == 0)
                clinfo->cl_process_count++;
            else
                wait = true;
        }
        else	/* PAUSE CLUSTER handling */
        {
            if (clinfo->cl_holder_pid != 0)
            {
                SpinLockRelease(&clinfo->cl_mutex);
                ereport(ERROR,
                        (errcode(ERRCODE_INTERNAL_ERROR),
                         errmsg("PAUSE CLUSTER already in progress")));
            }

            /*
             * There should be no other process holding the lock, including
             * ourselves
             */
            if (clinfo->cl_process_count > 0)
                wait = true;
            else
                clinfo->cl_holder_pid = MyProcPid;
        }
        SpinLockRelease(&clinfo->cl_mutex);

        /*
         * We use a simple sleep mechanism. If PAUSE CLUSTER has been
         * invoked, we are not worried about immediate performance
         * characteristics..
         */
        if (wait)
        {
            CHECK_FOR_INTERRUPTS();
            pg_usleep(100000L);
        }
        else	/* Got the proper shared/exclusive lock.. */
            break;
    }
}
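
/*
 * To summarize the states that AcquireClusterLock/ReleaseClusterLock
 * maintain under cl_mutex (a reading aid, not additional machinery):
 *
 *   cl_holder_pid == 0, cl_process_count == N   N backends running queries
 *   cl_holder_pid == P, cl_process_count == 0   cluster PAUSED by pid P
 *
 * A shared acquire succeeds only while cl_holder_pid is 0; an exclusive
 * acquire waits for cl_process_count to drain to 0 and errors out if some
 * other backend already holds the exclusive lock.
 */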

/*
 * ReleaseClusterLock
 *
 * Update the shared memory appropriately for the release. We do not really
 * need the bool argument, but it is there for some additional sanity
 * checking
 */
void
ReleaseClusterLock(bool exclusive)
{
    volatile ClusterLockInfo *clinfo = ClustLinfo;

    SpinLockAcquire(&clinfo->cl_mutex);
    if (exclusive)
    {
        if (clinfo->cl_process_count > 1 ||
            clinfo->cl_holder_pid == 0)
        {
            SpinLockRelease(&clinfo->cl_mutex);
            ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR),
                     errmsg("Inconsistent state while doing UNPAUSE CLUSTER")));
        }

        /*
         * Reset the holder pid. Any waiters in AcquireClusterLock will
         * eventually come out of their sleep, notice the new value and move
         * ahead
         */
        clinfo->cl_holder_pid = 0;
    }
    else
    {
        if (clinfo->cl_holder_pid != 0)
        {
            SpinLockRelease(&clinfo->cl_mutex);
            ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR),
                     errmsg("Inconsistent state while releasing CLUSTER lock")));
        }

        /*
         * Decrement our count. If a PAUSE is waiting inside
         * AcquireClusterLock elsewhere, it will wake from its sleep and
         * proceed
         */
        if (clinfo->cl_process_count > 0)
            clinfo->cl_process_count--;
    }
    SpinLockRelease(&clinfo->cl_mutex);
}
#endif
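
/*
 * Example operator workflow from psql (must be a superuser connected to a
 * coordinator; a sketch of the intended usage, not a regression test):
 *
 *   PAUSE CLUSTER;     -- block new queries on every coordinator
 *   -- ... perform maintenance ...
 *   UNPAUSE CLUSTER;   -- resume normal query processing
 */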