Skip to content

Commit bb86141

Browse files
committed
Kill dead-end children when there's nothing else left
Previously, the postmaster would never try to kill dead-end child processes, even if there were no other processes left. A dead-end backend will eventually exit when authentication_timeout expires, but if a dead-end backend is the only thing that's preventing the server from shutting down, it seems better to kill it immediately. This is particularly important if there is a bug in the early startup code that prevents a dead-end child from timing out and exiting normally. Includes a test for the case where a dead-end backend previously prevented the server from shutting down. Reviewed-by: Andres Freund <[email protected]> Discussion: https://www.postgresql.org/message-id/[email protected]
1 parent 18d67a8 commit bb86141

File tree

4 files changed

+116
-10
lines changed

4 files changed

+116
-10
lines changed

src/backend/postmaster/postmaster.c

+9-8
Original file line numberDiff line numberDiff line change
@@ -2985,10 +2985,11 @@ PostmasterStateMachine(void)
29852985
if (Shutdown >= ImmediateShutdown || FatalError)
29862986
{
29872987
/*
2988-
* Start waiting for dead_end children to die. This state
2989-
* change causes ServerLoop to stop creating new ones.
2988+
* Stop any dead_end children and stop creating new ones.
29902989
*/
29912990
pmState = PM_WAIT_DEAD_END;
2991+
ConfigurePostmasterWaitSet(false);
2992+
SignalChildren(SIGQUIT, btmask(B_DEAD_END_BACKEND));
29922993

29932994
/*
29942995
* We already SIGQUIT'd the archiver and stats processes, if
@@ -3027,9 +3028,10 @@ PostmasterStateMachine(void)
30273028
*/
30283029
FatalError = true;
30293030
pmState = PM_WAIT_DEAD_END;
3031+
ConfigurePostmasterWaitSet(false);
30303032

30313033
/* Kill the walsenders and archiver too */
3032-
SignalChildren(SIGQUIT, btmask_all_except(B_DEAD_END_BACKEND));
3034+
SignalChildren(SIGQUIT, BTYPE_MASK_ALL);
30333035
if (PgArchPID != 0)
30343036
signal_child(PgArchPID, SIGQUIT);
30353037
}
@@ -3048,14 +3050,13 @@ PostmasterStateMachine(void)
30483050
if (PgArchPID == 0 && CountChildren(btmask_all_except(B_DEAD_END_BACKEND)) == 0)
30493051
{
30503052
pmState = PM_WAIT_DEAD_END;
3053+
ConfigurePostmasterWaitSet(false);
3054+
SignalChildren(SIGTERM, BTYPE_MASK_ALL);
30513055
}
30523056
}
30533057

30543058
if (pmState == PM_WAIT_DEAD_END)
30553059
{
3056-
/* Don't allow any new socket connection events. */
3057-
ConfigurePostmasterWaitSet(false);
3058-
30593060
/*
30603061
* PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
30613062
* (ie, no dead_end children remain), and the archiver is gone too.
@@ -3381,12 +3382,12 @@ SignalChildren(int signal, BackendTypeMask targetMask)
33813382

33823383
/*
33833384
* Send a termination signal to children. This considers all of our children
3384-
* processes, except syslogger and dead_end backends.
3385+
* processes, except syslogger.
33853386
*/
33863387
static void
33873388
TerminateChildren(int signal)
33883389
{
3389-
SignalChildren(signal, btmask_all_except(B_DEAD_END_BACKEND));
3390+
SignalChildren(signal, BTYPE_MASK_ALL);
33903391
if (StartupPID != 0)
33913392
{
33923393
signal_child(StartupPID, signal);

src/test/perl/PostgreSQL/Test/Cluster.pm

+8-2
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,9 @@ this to fail. Otherwise, tests might fail to detect server crashes.
11941194
With optional extra param fail_ok => 1, returns 0 for failure
11951195
instead of bailing out.
11961196
1197+
The optional extra param timeout can be used to pass the pg_ctl
1198+
--timeout option.
1199+
11971200
=cut
11981201

11991202
sub stop
@@ -1209,8 +1212,11 @@ sub stop
12091212
return 1 unless defined $self->{_pid};
12101213

12111214
print "### Stopping node \"$name\" using mode $mode\n";
1212-
$ret = PostgreSQL::Test::Utils::system_log('pg_ctl', '-D', $pgdata,
1213-
'-m', $mode, 'stop');
1215+
my @cmd = ('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
1216+
if ($params{timeout}) {
1217+
push(@cmd, ('--timeout', $params{timeout}));
1218+
}
1219+
$ret = PostgreSQL::Test::Utils::system_log(@cmd);
12141220

12151221
if ($ret != 0)
12161222
{

src/test/postmaster/meson.build

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ tests += {
77
'tap': {
88
'tests': [
99
't/001_connection_limits.pl',
10+
't/002_start_stop.pl',
1011
],
1112
},
1213
}
+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
2+
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
3+
4+
# Test postmaster start and stop state machine.
5+
6+
use strict;
7+
use warnings FATAL => 'all';
8+
use PostgreSQL::Test::Cluster;
9+
use PostgreSQL::Test::Utils;
10+
use Test::More;
11+
12+
#
13+
# Test that dead-end backends don't prevent the server from shutting
14+
# down.
15+
#
16+
# Dead-end backends can linger until they reach
17+
# authentication_timeout. We use a long authentication_timeout and a
18+
# much shorter timeout for the "pg_ctl stop" operation, to test that
19+
# dead-end backends are killed at fast shutdown. If they're not,
20+
# "pg_ctl stop" will error out before the authentication timeout kicks
21+
# in and cleans up the dead-end backends.
22+
my $authentication_timeout = $PostgreSQL::Test::Utils::timeout_default;
23+
my $stop_timeout = $authentication_timeout / 2;
24+
25+
# Initialize the server with low connection limits, to test dead-end backends
26+
my $node = PostgreSQL::Test::Cluster->new('main');
27+
$node->init;
28+
$node->append_conf('postgresql.conf', "max_connections = 5");
29+
$node->append_conf('postgresql.conf', "max_wal_senders = 0");
30+
$node->append_conf('postgresql.conf', "autovacuum_max_workers = 1");
31+
$node->append_conf('postgresql.conf', "max_worker_processes = 1");
32+
$node->append_conf('postgresql.conf', "log_connections = on");
33+
$node->append_conf('postgresql.conf', "log_min_messages = debug2");
34+
$node->append_conf('postgresql.conf',
35+
"authentication_timeout = '$authentication_timeout s'");
36+
$node->append_conf('postgresql.conf', 'trace_connection_negotiation=on');
37+
$node->start;
38+
39+
if (!$node->raw_connect_works())
40+
{
41+
plan skip_all => "this test requires working raw_connect()";
42+
}
43+
44+
my @raw_connections = ();
45+
46+
# Open a lot of TCP (or Unix domain socket) connections to use up all
47+
# the connection slots. Beyond a certain number (roughly 2x
48+
# max_connections), they will be "dead-end backends".
49+
for (my $i = 0; $i <= 20; $i++)
50+
{
51+
my $sock = $node->raw_connect();
52+
53+
# On a busy system, the server might reject connections if
54+
# postmaster cannot accept() them fast enough. The exact limit
55+
# and behavior depends on the platform. To make this reliable,
56+
# we attempt SSL negotiation on each connection before opening
57+
# the next one. The server will reject the SSL negotiations, but
58+
# when it does so, we know that the backend has been launched
59+
# and we should be able to open another connection.
60+
61+
# SSLRequest packet consists of packet length followed by
62+
# NEGOTIATE_SSL_CODE.
63+
my $negotiate_ssl_code = pack("Nnn", 8, 1234, 5679);
64+
my $sent = $sock->send($negotiate_ssl_code);
65+
66+
# Read reply. We expect the server to reject it with 'N'
67+
my $reply = "";
68+
$sock->recv($reply, 1);
69+
is($reply, "N", "dead-end connection $i");
70+
71+
push(@raw_connections, $sock);
72+
}
73+
74+
# When all the connection slots are in use, new connections will fail
75+
# before even looking up the user. Hence you now get "sorry, too many
76+
# clients already" instead of a "role does not exist" error. Test that
77+
# to ensure that we have used up all the slots.
78+
$node->connect_fails("dbname=postgres user=invalid_user",
79+
"connect ",
80+
expected_stderr => qr/FATAL: sorry, too many clients already/);
81+
82+
# Open one more connection, to really ensure that we have at least one
83+
# dead-end backend.
84+
my $sock = $node->raw_connect();
85+
86+
# Test that the dead-end backends don't prevent the server from stopping.
87+
$node->stop('fast', timeout => $stop_timeout);
88+
89+
$node->start();
90+
$node->connect_ok("dbname=postgres", "works after restart");
91+
92+
# Clean up
93+
foreach my $socket (@raw_connections)
94+
{
95+
$socket->close();
96+
}
97+
98+
done_testing();

0 commit comments

Comments
 (0)