Commit fe0972e

Add further debug info to help debug 019_replslot_limit.pl failures.
See also afdeff1. Failures after that commit provided a few more hints, but not yet enough to understand what's going on.

In 019_replslot_limit.pl shut down nodes with fast instead of immediate mode if we observe the failure mode. That should tell us whether the failures we're observing are just a timing issue under high load. PGCTLTIMEOUT should prevent buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/[email protected]
Parent: 638300f
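
The extra elog(DEBUG3, ...) calls only reach the server log when log_min_messages is at least debug3, and the fast shutdowns added to the test are bounded by pg_ctl's PGCTLTIMEOUT environment variable. Below is a minimal sketch of how one might surface both while trying to reproduce the failure locally, assuming the in-tree TAP modules used by src/test/recovery are available (PostgreSQL::Test::Cluster in recent trees, PostgresNode in older ones); the node name and the 180-second timeout are arbitrary, and none of this is part of the commit itself:

	# Hypothetical reproduction helper, not part of this commit.
	use strict;
	use warnings;
	use PostgreSQL::Test::Cluster;

	# pg_ctl reads PGCTLTIMEOUT (seconds) and gives up waiting for startup
	# or shutdown after that long, so a stuck fast shutdown cannot hang a
	# run indefinitely.
	$ENV{PGCTLTIMEOUT} = 180;

	my $node = PostgreSQL::Test::Cluster->new('slot_debug');
	$node->init(allows_streaming => 1);

	# Make the new "replication slot drop: ..." and temporary-slot-cleanup
	# DEBUG3 messages show up in the node's server log.
	$node->append_conf('postgresql.conf', 'log_min_messages = debug3');

	$node->start;
	# ... exercise replication slots here, mirroring 019_replslot_limit.pl ...
	$node->stop('fast');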

5 files changed, 55 insertions(+), 1 deletion(-)

src/backend/replication/slot.c (+13)

@@ -569,6 +569,10 @@ ReplicationSlotCleanup(void)
 		if (!s->in_use)
 			continue;
 
+		/* unlocked read of active_pid is ok for debugging purposes */
+		elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+			 i, s->active_pid);
+
 		SpinLockAcquire(&s->mutex);
 		if (s->active_pid == MyProcPid)
 		{
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
 
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
 	/*
 	 * If some other backend ran this code concurrently with us, we might try
 	 * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 						path, tmppath)));
 	}
 
+	elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+		 NameStr(slot->data.name));
+
 	/*
 	 * The slot is definitely gone.  Lock out concurrent scans of the array
 	 * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	 * a slot while we're still cleaning up the detritus of the old one.
 	 */
 	LWLockRelease(ReplicationSlotAllocationLock);
+
+	elog(DEBUG3, "replication slot drop: %s: done",
+		 NameStr(slot->data.name));
 }
 
 /*
src/backend/storage/lmgr/lwlock.c (+7)

@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
 	}
 	return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+	return num_held_lwlocks;
+}

src/backend/utils/init/postinit.c (+17)

@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
 	 * them explicitly.
 	 */
 	LockReleaseAll(USER_LOCKMETHOD, true);
+
+	/*
+	 * temp debugging aid to analyze 019_replslot_limit failures
+	 *
+	 * If an error were thrown outside of a transaction nothing up to now
+	 * would have released lwlocks. We probably will add an
+	 * LWLockReleaseAll(). But for now make it easier to understand such cases
+	 * by warning if any lwlocks are held.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			held_lwlocks = LWLockHeldCount();
+		if (held_lwlocks)
+			elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+				 held_lwlocks);
+	}
+#endif
 }
 
 
src/include/storage/lwlock.h (+1)

@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);

src/test/recovery/t/019_replslot_limit.pl (+17 -1)

@@ -335,7 +335,23 @@
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+	my ($stdout, $stderr);
+	$node_primary3->psql('postgres',
+		"\\a\\t\nSELECT * FROM pg_stat_activity",
+		stdout => \$stdout, stderr => \$stderr);
+	diag $stdout, $stderr;
+	$node_primary3->stop('fast');
+	$node_standby3->stop('fast');
+	die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
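
A side note on the "if (not like(...))" construct used above: Test::More assertions such as like() return the pass/fail status of the check, so the test keeps reporting "have walsender pid ..." as before while also branching into the extra diagnostics (dumping pg_stat_activity, stopping both nodes in fast mode, then dying) when the value is not a single pid. A minimal, self-contained sketch of that behaviour, independent of the PostgreSQL test framework and using a made-up pid value:

	# Demonstration only; the pid value is invented.
	use strict;
	use warnings;
	use Test::More;

	my $senderpid = "12345";    # a single, well-formed pid

	if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
	{
		# With a malformed value (e.g. two pids separated by a newline),
		# like() would fail and return false, and this branch would run.
		diag "would dump pg_stat_activity and stop the nodes in fast mode";
	}

	done_testing();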
